# Sentiment Prediction of text

Uses TF-IDF vectorization to extract features from a text corpus and logistic regression to predict whether a text is positive or negative.

In [1]:
#Importing Libraries
# Data handling (pandas) and model serialization (pickle).
import pandas as pd
import pickle
# NOTE(review): LabelEncoder, word_tokenize and CountVectorizer are imported here
# but are not referenced by any cell visible in this notebook.
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Fetch the NLTK data packages; 'stopwords' backs the stopwords.words('english')
# call made later in the notebook.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Read the raw dataset (a Latin-1 encoded CSV) into a DataFrame and preview it.
df = pd.read_csv(
    'Dataset.csv',
    encoding='latin-1',
)
df

Unnamed: 0,Sentiment,ID,Timestamp,Query,Author,Text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [37]:
# Keep only the sentiment label and the text; the remaining metadata columns
# are not used by any later cell.
df = df.drop(columns=['ID', 'Timestamp', 'Query', 'Author'])
df

Unnamed: 0,Sentiment,Text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [38]:
#Remove links and special characters
import re

# Raw strings keep the backslash escapes (\(, \), \W) intact without relying on
# Python passing unknown escape sequences through unchanged.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
NON_WORD_PATTERN = re.compile(r'\W+')

# regex=True is required: since pandas 2.0 Series.str.replace treats a pattern as a
# literal string by default, which would silently leave every URL in place.
# Using .str.replace for both steps also passes missing values through instead of
# raising inside re.sub.
df['Text'] = (
    df['Text']
    .str.replace(URL_PATTERN, ' ', regex=True)       # 1) drop links
    .str.replace(NON_WORD_PATTERN, ' ', regex=True)  # 2) collapse runs of non-word chars
)
df

Unnamed: 0,Sentiment,Text
0,0,switchfoot Awww that s a bummer You shoulda g...
1,0,is upset that he can t update his Facebook by ...
2,0,Kenichan I dived many times for the ball Mana...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no it s not behaving at all i...
...,...,...
1599995,4,Just woke up Having no school is the best feel...
1599996,4,TheWDB com Very cool to hear old Walt intervie...
1599997,4,Are you ready for your MoJo Makeover Ask me fo...
1599998,4,Happy 38th Birthday to my boo of alll time Tup...


In [39]:
#Convert to lowercase
# Rebuild the frame with a lower-cased Text column so that differently cased
# spellings of a word map to the same token.
df = df.assign(Text=df['Text'].str.lower())
df

Unnamed: 0,Sentiment,Text
0,0,switchfoot awww that s a bummer you shoulda g...
1,0,is upset that he can t update his facebook by ...
2,0,kenichan i dived many times for the ball mana...
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no it s not behaving at all i...
...,...,...
1599995,4,just woke up having no school is the best feel...
1599996,4,thewdb com very cool to hear old walt intervie...
1599997,4,are you ready for your mojo makeover ask me fo...
1599998,4,happy 38th birthday to my boo of alll time tup...


In [41]:
#Shuffle rows and keep the first 500,000 of them, since the full dataset is too big to train on.
# A fixed random_state makes the shuffle -- and therefore the sample, the fitted
# vocabulary and the reported score -- reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).iloc[:500000]
df

Unnamed: 0,Sentiment,Text
357600,0,serenetan maynaseric simontay78 nolar hehe no...
330197,0,i wanted to see adventureland and it never cam...
1115481,4,jaqstone thanx for the ff love
101883,0,im sad vic is leaving for texas today cuz ima ...
1335025,4,bayboy04 i m good sweetie how r u doing this ...
...,...,...
773378,0,sarahrosemusic i was but my mom said no
1018213,4,robineccles thank you not nice to be called a...
202863,0,lychee juice significantly improves my usmle p...
1291479,4,just phoned paston ridings for some work exper...


In [42]:
#Initialize stop words and TF-IDF Vectorizer
# NOTE(review): stop_words is built here but never passed to the vectorizer below,
# so no stop-word filtering is actually applied.
stop_words = set(stopwords.words('english'))

# min_df=5 ignores terms found in fewer than 5 documents; max_features=2000 caps the
# vocabulary size. fit_transform learns the vocabulary and returns the TF-IDF matrix
# in one pass, leaving `vect` fitted for later transform() calls.
vect = TfidfVectorizer(min_df=5, max_features=2000)
X_train_vectorized = vect.fit_transform(df['Text'])

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the vectorized rows for evaluation. random_state pins the split so
# the test score reported further down is reproducible across kernel restarts.
XTrain, XTest, YTrain, YTest = train_test_split(
    X_train_vectorized,
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Restore a previously saved model from disk instead of re-training it.
# NOTE: unpickling can execute arbitrary code -- only load a file you created yourself.
Pkl_Filename = "Logistic.pkl"
with open(Pkl_Filename, mode='rb') as file:
    model = pickle.load(file)

Run the following 3 cells only if you want to re-train the model; otherwise, the cell above loads a previously saved model from `Logistic.pkl`.

In [44]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split with verbose solver
# output and 3 parallel jobs. fit() returns the estimator itself, so the chained
# call leaves `model` bound to the fitted classifier.
model = LogisticRegression(verbose=1, n_jobs=3).fit(XTrain, YTrain)
model

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    6.7s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=3, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)

In [45]:
# Score the classifier on the held-out split.
model.score(X=XTest, y=YTest)

0.77811

In [20]:
# Persist the trained model so a later session can load it without re-training.
# NOTE(review): only `model` is written; the fitted vectorizer `vect` is not saved,
# so a reloaded model depends on `vect` being re-fitted to the same vocabulary.
Pkl_Filename = "Logistic.pkl"
with open(Pkl_Filename, mode='wb') as file:
    pickle.dump(model, file)

# Predict sentiment for new text

In [None]:

# Read one line of text from the user and classify it.
new_text = input()

# Vectorize with the already-fitted TF-IDF vocabulary; transform() expects an
# iterable of documents, hence the single-element list.
new_t = vect.transform([new_text])

# Use `model` (the classifier trained or loaded above); the previous `model1`
# was never defined and raised NameError. predict() returns an array, so take its
# single element before comparing. In this dataset label 0 is negative and the
# only other label shown (4) is positive.
predicted_label = model.predict(new_t)[0]
print('Negative' if predicted_label == 0 else 'Positive')