In [1]:

import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt 


In [2]:
data = pd.read_csv("Acc 84/TwitterDataset.csv",encoding='latin-1')
data.head()

Unnamed: 0,target,tweet,username
0,1,the real reason why you're sad? you're attache...,depressingmsgs
1,1,my biggest problem is overthinking everything,depressingmsgs
2,1,the worst sadness is the sadness you've taught...,depressingmsgs
3,1,i cannot make you understand. i cannot make an...,depressingmsgs
4,1,i don't think anyone really understands how ti...,depressingmsgs


In [3]:
DATASET_COLUMNS = ["target", "TweetText", "user"]
data.columns = DATASET_COLUMNS

In [4]:
data.describe(include='all')

Unnamed: 0,target,TweetText,user
count,8135.0,8135,8135
unique,,7767,12
top,,do you ever feel ok but your sad at the same t...,depressingmsgs
freq,,10,998
mean,0.422864,,
std,0.494045,,
min,0.0,,
25%,0.0,,
50%,0.0,,
75%,1.0,,


In [5]:
data.dtypes

target        int64
TweetText    object
user         object
dtype: object

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8135 entries, 0 to 8134
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     8135 non-null   int64 
 1   TweetText  8135 non-null   object
 2   user       8135 non-null   object
dtypes: int64(1), object(2)
memory usage: 190.8+ KB


In [7]:
# Removing Twitter Handles (@user)
data['Clean_TweetText'] = data['TweetText'].str.replace("@", "") 
# Removing links
data['Clean_TweetText'] = data['Clean_TweetText'].str.replace(r"http\S+", "") 
# Removing Punctuations, Numbers, and Special Characters
data['Clean_TweetText'] = data['Clean_TweetText'].str.replace("[^a-zA-Z]", " ") 
# Remove stop words
import nltk
stopwords=nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    clean_text=' '.join([word for word in text.split() if word not in stopwords])
    return clean_text
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda text : remove_stopwords(text.lower()))
data.head()

  data['Clean_TweetText'] = data['Clean_TweetText'].str.replace(r"http\S+", "")
  data['Clean_TweetText'] = data['Clean_TweetText'].str.replace("[^a-zA-Z]", " ")


Unnamed: 0,target,TweetText,user,Clean_TweetText
0,1,the real reason why you're sad? you're attache...,depressingmsgs,real reason sad attached people distant paying...
1,1,my biggest problem is overthinking everything,depressingmsgs,biggest problem overthinking everything
2,1,the worst sadness is the sadness you've taught...,depressingmsgs,worst sadness sadness taught hide
3,1,i cannot make you understand. i cannot make an...,depressingmsgs,cannot make understand cannot make anyone unde...
4,1,i don't think anyone really understands how ti...,depressingmsgs,think anyone really understands tiring act oka...


In [8]:
# Text Tokenization and Normalization
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: nltk.word_tokenize(x))
data.head()

Unnamed: 0,target,TweetText,user,Clean_TweetText
0,1,the real reason why you're sad? you're attache...,depressingmsgs,"[real, reason, sad, attached, people, distant,..."
1,1,my biggest problem is overthinking everything,depressingmsgs,"[biggest, problem, overthinking, everything]"
2,1,the worst sadness is the sadness you've taught...,depressingmsgs,"[worst, sadness, sadness, taught, hide]"
3,1,i cannot make you understand. i cannot make an...,depressingmsgs,"[can, not, make, understand, can, not, make, a..."
4,1,i don't think anyone really understands how ti...,depressingmsgs,"[think, anyone, really, understands, tiring, a..."


In [9]:
# Now let’s stitch these tokens back together
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: ' '.join([w for w in x]))
# Removing small words
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
data.head()

Unnamed: 0,target,TweetText,user,Clean_TweetText
0,1,the real reason why you're sad? you're attache...,depressingmsgs,real reason attached people distant paying att...
1,1,my biggest problem is overthinking everything,depressingmsgs,biggest problem overthinking everything
2,1,the worst sadness is the sadness you've taught...,depressingmsgs,worst sadness sadness taught hide
3,1,i cannot make you understand. i cannot make an...,depressingmsgs,make understand make anyone understand happeni...
4,1,i don't think anyone really understands how ti...,depressingmsgs,think anyone really understands tiring okay al...


In [10]:
corpus = list(data['Clean_TweetText'])

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000)
X = cv.fit_transform(corpus).toarray()
y = data['target'].values

In [12]:
cv.get_feature_names()



['ability',
 'able',
 'absolutely',
 'accept',
 'account',
 'achieve',
 'acting',
 'action',
 'actions',
 'actsoffaith',
 'actually',
 'admit',
 'advice',
 'affirmation',
 'afraid',
 'aired',
 'alarm',
 'alive',
 'almost',
 'alone',
 'already',
 'also',
 'always',
 'amazing',
 'amazon',
 'america',
 'american',
 'android',
 'anger',
 'angry',
 'annoying',
 'another',
 'answer',
 'anti',
 'anxiety',
 'anybody',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'apart',
 'appreciate',
 'april',
 'around',
 'asked',
 'asking',
 'asks',
 'asleep',
 'attention',
 'audio',
 'austin',
 'authorities',
 'available',
 'avoid',
 'awake',
 'awakenings',
 'awakeningsapp',
 'away',
 'awesome',
 'baby',
 'back',
 'badly',
 'based',
 'basically',
 'beat',
 'beautiful',
 'became',
 'become',
 'becoming',
 'behind',
 'believe',
 'beloveds',
 'best',
 'better',
 'beyond',
 'biden',
 'bigger',
 'biggest',
 'billion',
 'birthday',
 'bitch',
 'bitcoin',
 'black',
 'body',
 'book',
 'born',
 'both

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer()
X = tf_transformer.fit_transform(X).toarray()

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(max_features =1000)
X = tfidfVectorizer.fit_transform(corpus).toarray()

In [15]:
from sklearn.model_selection import train_test_split
X_train_s, X_test_s , y_train_s, y_test_s = train_test_split(X, y , test_size = 0.20, random_state=101)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
# Naive Bayes is a statistical classification technique based on Bayes Theorem
# common classifier used in sentiment analysis is the Naive Bayes Classifier.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier # this is experimental
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

classifiers = [GradientBoostingClassifier(),GaussianNB(),HistGradientBoostingClassifier(),
               RandomForestClassifier(),LogisticRegression(),XGBClassifier(),LGBMClassifier(),
               CatBoostClassifier(verbose=0),DecisionTreeClassifier(),KNeighborsClassifier(),SVC()]



In [17]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
for classifier in classifiers:
    print(f'Training Model {classifier} \n--------------')
    classifier.fit(X_train_s,y_train_s)
    y_pred = classifier.predict(X_test_s)
    print(f'Training Accuracy: {classifier.score(X_train_s, y_train_s)}')
    print(f'Testing Accuracy: {accuracy_score(y_test_s, y_pred)}')
    print(f'Testing Confusion Matrix: \n{confusion_matrix(y_test_s, y_pred)}')
    print(classification_report(y_test_s, y_pred))
    print('-'*30)

Training Model GradientBoostingClassifier() 
--------------
Training Accuracy: 0.7678242163491088
Testing Accuracy: 0.7252612169637369
Testing Confusion Matrix: 
[[891  53]
 [394 289]]
              precision    recall  f1-score   support

           0       0.69      0.94      0.80       944
           1       0.85      0.42      0.56       683

    accuracy                           0.73      1627
   macro avg       0.77      0.68      0.68      1627
weighted avg       0.76      0.73      0.70      1627

------------------------------
Training Model GaussianNB() 
--------------
Training Accuracy: 0.7460049170251998
Testing Accuracy: 0.7320221266133989
Testing Confusion Matrix: 
[[537 407]
 [ 29 654]]
              precision    recall  f1-score   support

           0       0.95      0.57      0.71       944
           1       0.62      0.96      0.75       683

    accuracy                           0.73      1627
   macro avg       0.78      0.76      0.73      1627
weighted avg    

In [22]:

from sklearn.neural_network import MLPClassifier

model_mlp= MLPClassifier(random_state=1, max_iter=500, alpha=0.005)
model_mlp.fit(X_train_s,y_train_s)
y_pred = model_mlp.predict(X_test_s)
print(f'Training Accuracy: {classifier.score(X_train_s, y_train_s)}')
print(f'Testing Accuracy: {accuracy_score(y_test_s, y_pred)}')
print(f'Testing Confusion Matrix: \n{confusion_matrix(y_test_s, y_pred)}')
print(classification_report(y_test_s, y_pred))

Training Accuracy: 0.9383835279655808
Testing Accuracy: 0.8100799016594961
Testing Confusion Matrix: 
[[766 178]
 [131 552]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       944
           1       0.76      0.81      0.78       683

    accuracy                           0.81      1627
   macro avg       0.81      0.81      0.81      1627
weighted avg       0.81      0.81      0.81      1627

