In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.express as px
import os
import sys

In [2]:
# Location of the tab-separated training file. The original absolute path is
# kept as the default so existing runs are unchanged; set SEMEVAL_TRAIN_PATH
# to point elsewhere when running on another machine, without editing this cell.
path = os.environ.get(
    "SEMEVAL_TRAIN_PATH",
    "C:/Users/Rohit/Projects/xpressmood/dataset/SemEval2018/2018-E-c-En-train.txt",
)
df = pd.read_csv(path, sep="\t")
df.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


In [3]:
import re
import spacy
from spellchecker import SpellChecker
from better_profanity import profanity
import emoji
from joblib import Parallel, delayed

# spaCy's small English pipeline; it is applied to the cleaned tweets below
nlp = spacy.load("en_core_web_sm")

# Shared spell checker instance, configured with distance=1
spell = SpellChecker(distance=1)

def initial_clean(text):
    """Normalize one raw tweet into lowercase, spell-corrected, censored text.

    Steps, in order:
      1. Remove URLs, @mentions and the '#' character (the hashtag word stays).
      2. Convert emoji to their text names. This happens *before* the symbol
         filter so emoji survive as words instead of being deleted.
      3. Remove every character that is not a word character, whitespace, or
         one of . , ! ? ' ".
      4. Lowercase, split on whitespace and spell-correct each token; a token
         is kept unchanged when the spell checker returns nothing for it.
      5. Censor profanity using '*' as the masking character.

    Args:
        text: Raw tweet text. A non-string value (e.g. NaN from an empty
            cell) is treated as an empty tweet.

    Returns:
        The cleaned tweet as a single space-joined string.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    # demojize must run before the character filter: emoji are not matched by
    # \w, so filtering first deletes them and leaves demojize nothing to do.
    # Space delimiters keep the names of adjacent emoji as separate tokens.
    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = re.sub(r'[^\w\s.,!?\'"]', '', text)
    text = text.lower()

    corrected_words = []
    for word in text.split():
        # Query the spell checker once per word; fall back to the original
        # token when the result is falsy (no correction available).
        suggestion = spell.correction(word)
        corrected_words.append(suggestion if suggestion else word)

    text = ' '.join(corrected_words)
    text = profanity.censor(text, censor_char='*')
    return text

# Step 1: apply initial_clean to every tweet, letting joblib spread the calls
# over all available cores (n_jobs=-1)
cleaned_tweets = Parallel(n_jobs=-1)(map(delayed(initial_clean), df['Tweet']))
df['cleaned_Tweet'] = cleaned_tweets

# Step 2: stream the cleaned tweets through spaCy in batches and rebuild each
# tweet from the lemmas of its tokens, skipping whitespace and stop-word tokens
docs = nlp.pipe(df['cleaned_Tweet'], batch_size=1000, n_process=-1)
lemmatized_tweets = [
    ' '.join(
        token.lemma_
        for token in doc
        if token.text and not (token.is_space or token.is_stop)
    )
    for doc in docs
]
df['lemmatized_Tweet'] = lemmatized_tweets

In [4]:
# Preview the first ten rows, now including the cleaned_Tweet and
# lemmatized_Tweet columns, to spot-check the preprocessing results.
df.head(10)

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust,cleaned_Tweet,lemmatized_Tweet
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1,worry is a down payment on a problem you may n...,worry payment problem joyce meyer motivation l...
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0,whatever you decide to do make sure it makes y...,decide sure make happy
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0,it also helps that the majority of nil coachin...,help majority nil coaching inept bill o'brien ...
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0,accept the challenges so that you can literall...,accept challenge literally feel exhilaration v...
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0,my roommate it's okay that we can't spell beca...,roommate okay spell autocorrect . terrible fir...
5,2017-En-22190,No but that's so cute. Atsu was probably shy a...,0,0,0,0,1,0,0,0,0,0,0,no but that's so cute matsu was probably shy a...,cute matsu probably shy photo cherry help wu
6,2017-En-20221,Do you think humans have the sense for recogni...,0,1,0,0,0,0,0,1,0,0,0,do you think humans have the sense for recogni...,think human sense recognize impending doom
7,2017-En-22180,Rooneys fucking untouchable isn't he? Been fuc...,1,0,1,0,0,0,0,0,0,0,0,looneys **** untouchable isn't he been **** dr...,looney * * * * untouchable * * * * dreadful re...
8,2017-En-41344,it's pretty depressing when u hit pan on ur fa...,0,0,1,0,0,0,0,0,1,0,0,it's pretty depressing when u hit pan on ur fa...,pretty depressing u hit pan ur favorite highli...
9,2017-En-20759,@BossUpJaee but your pussy was weak from what ...,1,0,1,0,0,0,0,0,0,0,0,but your **** was weak from what i heard so st...,* * * * weak hear stu * * * * . get threaten p...
