In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\parma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [5]:
dataset=pd.read_csv(r'C:\Users\parma\OneDrive\Desktop\dataScience\twitterSentimentAnalysis\dataset.csv', encoding='ISO-8859-1')

In [6]:
dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = col_names

In [8]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [9]:
dataset.shape

(1599999, 6)

In [10]:
#checking for missing values
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [11]:
#distribution of tweets
dataset['target'].value_counts()

4    800000
0    799999
Name: target, dtype: int64

In [12]:
#converting 0 to -ve and 4 to +ve
dataset['target'] = dataset['target'].map({0:0, 4:1})

In [13]:
dataset['target'].value_counts()

1    800000
0    799999
Name: target, dtype: int64

In [35]:
important_words = {
    'no', 'not', 'nor', "don't", "doesn't", "didn't", "hadn't",
    "hasn't", "haven't", "isn't", "mightn't", "mustn't",
    "needn't", "shan't", "shouldn't", "wasn't", "weren't", 
    "won't", "wouldn't", "can't", "couldn't", "ain", "aren", "doesn",
    "didn", "hadn", "hasn", "haven", "isn", "mightn", "mustn", 
    "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
    "cannot", "never"
}


In [38]:
#stemming

stemmer = PorterStemmer()

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [
        stemmer.stem(word)
        for word in stemmed_content
        if word.lower() not in stopwords.words('english') or word.lower() in important_words
    ]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content
                             

In [39]:
dataset['text'] = dataset['text'].apply(stemming)

In [40]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,kwesidei whole crew


In [41]:
x = dataset['text']
y = dataset['target']

In [42]:
#splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)

In [43]:
#convert textual data to numerical data
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [44]:
print(x_train)

  (0, 144231)	0.48329467052638964
  (0, 381172)	0.3864905771519735
  (0, 158895)	0.1896257191583114
  (0, 419362)	0.4214045286246831
  (0, 244172)	0.5162122830998325
  (0, 390360)	0.18633205668988093
  (0, 148279)	0.1260218447188475
  (0, 149175)	0.18775428169563985
  (0, 440386)	0.2262380889395402
  (1, 170569)	0.6067508917591006
  (1, 414651)	0.7948920400592373
  (2, 403636)	0.6931772675970436
  (2, 273486)	0.37697192470536717
  (2, 288311)	0.24840874494201365
  (2, 149261)	0.20986033478871757
  (2, 41848)	0.5211997496059727
  (3, 173661)	0.30748517599490516
  (3, 88618)	0.5137978725482596
  (3, 134011)	0.18397346866370848
  (3, 290098)	0.33523441012020005
  (3, 396636)	0.21912425499180688
  (3, 314840)	0.5137978725482596
  (3, 173664)	0.42805674021867435
  (4, 405240)	0.1474702401054899
  (4, 297855)	0.20648499733882067
  :	:
  (1279995, 100624)	0.8081378385538067
  (1279995, 246844)	0.5889934073464502
  (1279996, 275098)	0.693054651901408
  (1279996, 132561)	0.34542389125369954
  (

In [45]:
#Trainign the model
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
#testing model
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.776721875


In [47]:
#function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if not word in stopwords.words('english')]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [52]:
#testing the model
print(predict_sentiment("i dont like this book"))
print(predict_sentiment("I love mumbai"))

Negative
Positive


In [49]:
#save the model
import pickle
pickle.dump(model, open('model.pkl', 'wb'))

In [50]:
pickle.dump(vectorizer, open('vectorizer.pkl','wb'))