In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import statsmodels.formula.api as smf
import emoji
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [28]:
df = pd.read_csv('Tweets.csv', encoding='latin1')

In [28]:
# Dropping duplicates and changing column names
df.drop_duplicates(inplace=True)
df = df.set_axis(['target','id','date','flag','user','text'], axis='columns')

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [32]:
# Checking to see if there are any Null values
df.isnull().any().sum()

0

In [40]:
# Convert text to lowercase
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@kenichan i dived many times for the ball. man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@kwesidei not the whole crew


In [41]:

# Define a function to remove mentions
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

# Apply the function to the 'text' column
df['text'] = df['text'].apply(remove_mentions)

# Strip any extra spaces that may remain after removing mentions
df['text'] = df['text'].str.strip()

# Display the updated DataFrame
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many times for the ball. managed to sa...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"no, it's not behaving at all. i'm mad. why am ..."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,not the whole crew


In [56]:
# Removing stopwords from the data
stop_words = stopwords.words("english")
df["text"] = df["text"].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

# removing links
df["text"] = df["text"].apply(lambda x: re.sub(r"http\S+|www\.\S+", "", x))

# removing email addresses
df["text"] = df["text"].apply(lambda x: re.sub(r"\w+@\w+\.com", "", x))

# removing punctuation marks
df["text"] = df["text"].apply(lambda x: re.sub(r"[.,;:!\?\"'`]", "", x))

# removing special characters
df["text"] = df["text"].apply(lambda x: re.sub(r"[@#\$%^&*\(\)\\/\+-_=\[\]\{\}<>]", "", x))

# removing unnecessary characters
df["text"] = df["text"].apply(lambda x: re.sub(r"½m|½s|½t|½ï", "", x))


In [131]:
# applying lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
df["text"] = df["text"].apply(lambda x: " ".join(wnl.lemmatize(word, "v") for word in x.split()))

df.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset cant update facebook texting it might cr...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dive many time ball manage save rest go bind
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feel itchy like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no behave all im mad here cant see there
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,need hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,hey long time see yes rain bite only bite lol ...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,nope
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,que muera
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break plain city snow


In [32]:
print('Number of Negative Tweets:', (df['target'] == 0).sum())
print('Number of Postive Tweets:', (df['target'] == 4).sum())

Number of Negative Tweets: 799999
Number of Postive Tweets: 800000


In [58]:
#80/20 training split
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['target'])

In [59]:
# Convert text data into numerical form
vectorizer = CountVectorizer().fit(x_train)
x_train_vectorized = vectorizer.transform(x_train)

In [62]:
# Train Model on Training Data
model = MultinomialNB()
model.fit(x_train_vectorized, y_train)

In [64]:
# Evaluating the model on the test data
prediction = model.predict(vectorizer.transform(x_test))

print('Confusion Matrix: \n', confusion_matrix(y_test, prediction))

print('\nAccuracy: \n', round(accuracy_score(y_test, prediction) * 100, 2), '%')

Confusion Matrix: 
 [[155536  44630]
 [ 50114 149720]]

Accuracy: 
 76.31 %


In [30]:
df2 = pd.read_csv('Tweets.csv', encoding='latin1')

In [31]:
# Dropping duplicates and changing column names
df2.drop_duplicates(inplace=True)
df2 = df2.set_axis(['target','id','date','flag','user','text'], axis='columns')
df2['text'] = df2['text'].str.lower()

In [32]:
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)


In [33]:
df2['text'] = df2['text'].apply(remove_mentions)

def clean_text_v2(text):
    text = re.sub(r"http\S+|www\.\S+", "", text)  # Remove URLs
    text = re.sub(r"\w+@\w+\.com", "", text)     # Remove emails
     # Normalize repeated punctuation (! and ?)
    text = re.sub(r"!{2,}", "!", text)  # Replace multiple exclamation marks with one
    text = re.sub(r"\?{2,}", "?",text)  # Replace multiple question marks with one
    text = re.sub(r"[.,;:\"'`]", "", text)     # Remove punctuation  but keep ! and ?
    text = re.sub(r"[@\$%^&*\(\)\\/\+-_=\[\]\{\}<>]", "", text)  # Remove special chars

    text = emoji.demojize(text, delimiters=(" ", " "))
    return text.strip()
df2['text'] = df2['text'].apply(clean_text_v2)

In [34]:
df2.head(10)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook by t...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many times for the ball managed to sav...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no its not behaving at all im mad why am i her...
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,not the whole crew
5,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,need a hug
6,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,hey long time no see! yes rains a bit only a ...
7,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,nope they didnt have it
8,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,que me muera
9,0,1467812416,Mon Apr 06 22:20:16 PDT 2009,NO_QUERY,erinx3leannexo,spring break in plain city its snowing


In [35]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(df2['text'], df2['target'])

In [250]:
tfidf_1 = TfidfVectorizer(input = "content", encoding = "utf-8", decode_error = "strict",
                          strip_accents = None, lowercase = True, preprocessor = None,
                          tokenizer = None, analyzer = "word", stop_words = None,
                          token_pattern = r"(?u)\b\w\w+\b", ngram_range = (1,1),
                          max_df = 1.0, min_df = 1, max_features = 5000, vocabulary = None,
                          binary = False, dtype = np.float64, norm = "l2", use_idf = True,
                          smooth_idf = True, sublinear_tf = False).fit(x_train2)

In [251]:
x_train_vectorized2 = tfidf_1.transform(x_train2)

In [252]:
model2 = MultinomialNB()
model2.fit(x_train_vectorized2, y_train2)

In [253]:
# Evaluating the model on the test data
prediction2 = model2.predict(tfidf_1.transform(x_test2))

print('Confusion Matrix: \n', confusion_matrix(y_test2, prediction2))

print('\nAccuracy: \n', round(accuracy_score(y_test2, prediction2) * 100, 2), '%')

Confusion Matrix: 
 [[155923  43968]
 [ 47864 152245]]

Accuracy: 
 77.04 %


In [238]:
model3 = LogisticRegression(solver='saga',max_iter=1000)
model3.fit(x_train_vectorized2, y_train2)
predictions3 = model3.predict(tfidf_1.transform(x_test2))

print('Confusion Matrix: \n', confusion_matrix(y_test2, predictions3))

print('\nAccuracy: \n', round(accuracy_score(y_test2, predictions3) * 100, 2), '%')

Confusion Matrix: 
 [[155373  44638]
 [ 40019 159970]]

Accuracy: 
 78.84 %


Logistic Regression performs better than NB

In [36]:
# Testing with bi-gram
tfidf_2 = TfidfVectorizer(max_features=20000, ngram_range = (1,2)).fit(x_train2)
x_train_vectorized3 = tfidf_2.transform(x_train2)
x_test_vectorized = tfidf_2.transform(x_test2)

In [37]:
model4 = LogisticRegression(solver='saga',max_iter=1000 )
model4.fit(x_train_vectorized3, y_train2)
predictions4 = model4.predict(x_test_vectorized)

print('Confusion Matrix: \n', confusion_matrix(y_test2, predictions4))

print('\nAccuracy: \n', round(accuracy_score(y_test2, predictions4) * 100, 2), '%')

Confusion Matrix: 
 [[160160  40074]
 [ 34942 164824]]

Accuracy: 
 81.25 %


Logistic Regression using Tfid with bi grams will be the model. Furter accuracy improvement will be done on another notebook