In [1]:
# System
import os
#nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Time
import time
import datetime

# Numerical
import numpy as np
import pandas as pd

# Tools
import itertools
from collections import Counter
from wordcloud import WordCloud,STOPWORDS

# NLP
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
# from pywsd.utils import lemmatize_sentence

# Preprocessing
from sklearn import preprocessing
from sklearn.utils import class_weight as cw
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, ENGLISH_STOP_WORDS, TfidfTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from bs4 import BeautifulSoup
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
import re

# Model Selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,cross_val_predict

# Machine Learning Models
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline

# Evaluation Metrics
from sklearn import metrics 
from sklearn.metrics import f1_score, accuracy_score,confusion_matrix,classification_report, roc_auc_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_csv('data/twitter_sentiment_data.csv')

### Pre-processing

In [3]:
df_pr=df
#Beautiful Soup is a library that makes it easy to scrape information from web pages.
#It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.
df_pr["message"] = df_pr["message"].apply(lambda x: BeautifulSoup(x, "lxml").get_text())
#print(df_pr["message"])

In [4]:
df_pr["message"] = df_pr["message"].apply(lambda x: x.lower())
#print(df_pr["message"]) 

In [5]:
# remove URLs, RTs, and twitter handles
for i in range(len(df_pr['message'])):
    df_pr['message'][i] = " ".join([word for word in df_pr['message'][i].split()
                                if 'http' not in word and '@' not in word and '<' not in word and word != 'rt'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
#Regex
#Returning the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl
#replace whatever except alphabets to space
df_pr["message"] = df_pr["message"].apply(lambda x: re.sub("https?://[^a-zA-Z]", " ", x))
# removing some punctuations
df_pr["message"] = df_pr["message"].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x))
#replace tab with space
df_pr["message"] = df_pr["message"].apply(lambda x: re.sub("\s+", " ", x))

In [7]:
# creating a pre-processed dataframe and renaming columns to ease the analysis
clean_df=df_pr.rename(columns={"message": "text", "sentiment": "target"})
clean_df=clean_df[["text","target"]]
clean_df.head()

Unnamed: 0,text,target
0,climate change is an interesting hustle as it ...,-1
1,watch beforetheflood right here as travels the...,1
2,fabulous leonardo dicaprio's film on climate c...,1
3,just watched this amazing documentary by leona...,1
4,pranita biswasi a lutheran from odisha gives t...,2


### Modeling

### Basic Logistic Regression with tfidf vectorizer

In [9]:
# Define the vector of labels and matrix of features
y = clean_df["target"]
X = clean_df["text"]

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
vct_tweet=TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)

# Create sparse matrix from the tfidf vectorizer
tfidf_train_x = vct_tweet.fit_transform(X_train)
tfidf_test_x = vct_tweet.fit_transform(X_test)

# Build a logistic regression model and print out the accuracy
log_reg = LogisticRegression()
log_reg.fit(tfidf_train_x,y_train)
#ypred=log_reg.predict(tfidf_test_x)
#print (tfidf_test_x.shape)
scores = cross_val_score(log_reg,tfidf_test_x, y_test)
acc = scores.mean()
print ("Accuracy: %0.2f percent" % (acc *100))
#print("Summary"+classification_report(y_test, ypred))

Accuracy: 64.20 percent


### SGD CLassifier

In [32]:
# Creating a pipeline and running SGDCLassifer
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),
])
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)
f1_score(y_test,y_predict,average='micro')

0.702241438161338

### Comparing and tuning Logistic Regression with penalties

In [14]:
#set up X and y
X_train = clean_df['text']
y_train = clean_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)
#set up baseline 
print (y_train.value_counts(normalize=True))
baseline = 0.3

 1    0.523440
 2    0.210673
 0    0.175286
-1    0.090601
Name: target, dtype: float64


In [15]:
lr = LogisticRegression(random_state=50)

In [16]:
# initalise the vectorizer 
vector_twt = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
# fit the training data on the model
vector_twt.fit(X_train)

#transform training data into sparse matrix
X_train_twt = vector_twt.transform(X_train)

# cross val score/ predict for the four categories
twt_score = cross_val_score(lr, X_train_twt, y_train, cv=4)
tf_sc=twt_score.mean()
print ("Tfidf Vectorizer Score: %0.2f percent" % (tf_sc *100))


Tfidf Vectorizer Score: 68.65 percent


In [17]:
# initalise the count vectorizer 
cvector_twt = CountVectorizer(stop_words=stopwords.words('english'))
# fit the training data on the model
cvector_twt.fit(X_train)

#transform training data into sparse matrix
X_train_cvt = cvector_twt.transform(X_train)

# cross val score/ predict
cvct_score = cross_val_score(lr, X_train_cvt, y_train, cv=4)
cv_sc=cvct_score.mean()
print ("Tfidf Vectorizer Score: %0.2f percent" % (cv_sc *100))

Tfidf Vectorizer Score: 71.06 percent


In [24]:
# Transform features 
cvect = CountVectorizer(ngram_range=(1,4)) 
cvect.fit(X_train)
X_train_cvect= cvect.transform(X_train)
X_test_cvect= cvect.transform(X_test)
# fit with l1 
model_l1 = LogisticRegressionCV(Cs=np.logspace(-10,10,21),penalty = 'l1',solver='liblinear',cv=3) 
model_l1.fit(X_train_cvect, y_train)

print("Cross Validation Score:", cross_val_score(model_l1,X_train_cvect,y_train))

Cross Validation Score: [0.71678471 0.7154194  0.71876067]


In [34]:
acc_l1=(cross_val_score(model_l1,X_train_cvect,y_train)).mean()
print("Accuracy with L1 Penalty: %0.2f percent" % (acc_l1 *100))

Accuracy with L1 Penalty: 71.67 percent


In [26]:
# fit with l2
model_l2 = LogisticRegressionCV(Cs=np.logspace(-10,10,21),penalty = 'l2',solver='liblinear',cv=3) 
model_l2.fit(X_train_cvect, y_train)

print("Cross Validation Score:", cross_val_score(model_l2,X_train_cvect,y_train))

Cross Validation Score: [0.73248571 0.7281338  0.73173438]


In [35]:
acc_l2=(cross_val_score(model_l2,X_train_cvect,y_train)).mean()
print("Accuracy with L1 Penalty: %0.2f percent" % (acc_l2 *100))

Accuracy with L1 Penalty: 73.08 percent


In [37]:
# Random Forest Classifier
y = clean_df["target"]
X = clean_df["text"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# define stop words
stopwords = stopwords.words('english')

#remove stopwords
vectorizer = CountVectorizer(stop_words=stopwords)

X = vectorizer.fit_transform(X)
# running a random forest on the dataset by removing stopwords using the countverctorizer
model3 = RandomForestClassifier(n_estimators=200, n_jobs=-1, class_weight='balanced', random_state=50)

#Evaluate a score by cross-validation
print("Cross Validation Score:",cross_val_score(model, X, y, cv=3))
accu_RF=(cross_val_score(model, X, y, cv=3)).mean()
print("Accuracy For the Random Forest: %0.2f percent" % (accu_RF *100))

Cross Validation Score: [0.62008465 0.58178591 0.57445211]
Accuracy For the Random Forest: 59.21 percent
