## Sentter
##### The objective of this project is to predict the sentiment of the given statement. A voting classifier is designed with the help of three classifiers.

###### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.externals import joblib

In [2]:
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']

###### Importing the dataset

In [3]:
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding = 'ISO-8859-1', header = None, names = cols)

###### Here we see that, there are 80,00,000 positive and 80,00,000 negative pre-labeled tweets

In [4]:
df.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [5]:
df.drop(['id', 'date', 'query_string', 'user'], axis = 1, inplace = True)

###### Finding the length of every tweet

In [6]:
df['pre_clean_len'] = [len(t) for t in df.text]

In [7]:
df['sentiment'] = df['sentiment'].map({0:0, 4:1})

###### Pre-processing of tweets

In [8]:
tok = WordPunctTokenizer()

In [9]:
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
        
    except:
        clean = stripped
    
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [10]:
%%time
print ("Cleaning and parsing the tweets...\n")
clean_tweet_texts = []
for i in range(1600000):
    
    clean_tweet_texts.append(tweet_cleaner(df['text'][i]))

Cleaning and parsing the tweets...



  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)


Wall time: 8min 49s


In [11]:
clean_df = pd.DataFrame(clean_tweet_texts, columns=['text'])
clean_df['target'] = df.sentiment
clean_df.head()

Unnamed: 0,text,target
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


###### Saving the cleaned tweets to a new csv file

In [12]:
clean_df.to_csv('clean_tweet.csv', encoding='utf-8')

In [13]:
df = pd.read_csv("clean_tweet.csv")

In [14]:
df = df.dropna()

###### TF-IDF Vectorizer

In [15]:
corpus = df['text']

In [16]:
vectorizer = TfidfVectorizer()

In [17]:
X_vect = vectorizer.fit_transform(corpus)

In [18]:
y = df['target']

###### Splitting the data into training, testing and validation sets

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.04, random_state=42)

In [20]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [21]:
X_train.shape

(1532882, 273694)

In [22]:
X_test.shape

(31935, 273694)

In [23]:
X_val.shape

(31936, 273694)

###### Creating a Logistic Regression Model

In [24]:
lr = LogisticRegression(random_state=42, verbose=1)

In [25]:
lr.fit(X_train, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)

In [26]:
y_pred_lr = lr.predict(X_test)

In [27]:
y_array_lr = y_test.as_matrix(columns=None)

  """Entry point for launching an IPython kernel.


In [28]:
same_lr = 0
for i in range(y_test.count()):
    if(y_array_lr[i] == y_pred_lr[i]):
        same_lr = same_lr+1
same_lr

25641

In [29]:
accuracy_lr = (same_lr) / float(y_test.count())
accuracy_lr

0.8029121653358384

### The accuracy of Logistic Regression model is 80.29%

In [30]:
print(classification_report(y_test, y_pred_lr))

             precision    recall  f1-score   support

          0       0.81      0.79      0.80     15868
          1       0.80      0.81      0.81     16067

avg / total       0.80      0.80      0.80     31935



###### Creating a Multinomial Naive Bayes Model

In [31]:
mnb = MultinomialNB()

In [32]:
%%time
mnb.fit(X_train, y_train)

Wall time: 470 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
y_pred_mnb = mnb.predict(X_test)

In [34]:
y_array_mnb = y_test.as_matrix(columns=None)

  """Entry point for launching an IPython kernel.


In [35]:
same_mnb = 0
for i in range(y_test.count()):
    if(y_array_mnb[i] == y_pred_mnb[i]):
        same_mnb = same_mnb+1
same_mnb

24750

In [36]:
accuracy_mnb = (same_mnb) / float(y_test.count())
accuracy_mnb

0.7750117426021607

### The accuracy of Logistic Regression model is 77.50%

In [37]:
print(classification_report(y_test, y_pred_mnb))

             precision    recall  f1-score   support

          0       0.76      0.79      0.78     15868
          1       0.79      0.76      0.77     16067

avg / total       0.78      0.78      0.77     31935



###### Creating a Stochastic Gradient Descent Model

In [38]:
sgd = linear_model.SGDClassifier(verbose=2, max_iter=200, loss='log')

In [39]:
%%time
sgd.fit(X_train, y_train)

-- Epoch 1
Norm: 28.41, NNZs: 267036, Bias: 0.289673, T: 1532882, Avg. loss: 0.515293
Total training time: 0.57 seconds.
-- Epoch 2
Norm: 28.41, NNZs: 267036, Bias: 0.291307, T: 3065764, Avg. loss: 0.514084
Total training time: 1.16 seconds.
-- Epoch 3
Norm: 28.40, NNZs: 267036, Bias: 0.294371, T: 4598646, Avg. loss: 0.513916
Total training time: 1.77 seconds.
-- Epoch 4
Norm: 28.39, NNZs: 267036, Bias: 0.291386, T: 6131528, Avg. loss: 0.513876
Total training time: 2.36 seconds.
-- Epoch 5
Norm: 28.38, NNZs: 267036, Bias: 0.293674, T: 7664410, Avg. loss: 0.513868
Total training time: 2.94 seconds.
-- Epoch 6
Norm: 28.39, NNZs: 267036, Bias: 0.291144, T: 9197292, Avg. loss: 0.513846
Total training time: 3.52 seconds.
-- Epoch 7
Norm: 28.39, NNZs: 267036, Bias: 0.291457, T: 10730174, Avg. loss: 0.513835
Total training time: 4.15 seconds.
-- Epoch 8
Norm: 28.38, NNZs: 267036, Bias: 0.292255, T: 12263056, Avg. loss: 0.513826
Total training time: 4.77 seconds.
-- Epoch 9
Norm: 28.38, NNZs: 

Norm: 28.38, NNZs: 267036, Bias: 0.292426, T: 104235976, Avg. loss: 0.513769
Total training time: 42.20 seconds.
-- Epoch 69
Norm: 28.38, NNZs: 267036, Bias: 0.292432, T: 105768858, Avg. loss: 0.513770
Total training time: 42.82 seconds.
-- Epoch 70
Norm: 28.38, NNZs: 267036, Bias: 0.292363, T: 107301740, Avg. loss: 0.513770
Total training time: 43.46 seconds.
-- Epoch 71
Norm: 28.38, NNZs: 267036, Bias: 0.292378, T: 108834622, Avg. loss: 0.513771
Total training time: 44.06 seconds.
-- Epoch 72
Norm: 28.38, NNZs: 267036, Bias: 0.292407, T: 110367504, Avg. loss: 0.513769
Total training time: 44.69 seconds.
-- Epoch 73
Norm: 28.38, NNZs: 267036, Bias: 0.292402, T: 111900386, Avg. loss: 0.513769
Total training time: 45.33 seconds.
-- Epoch 74
Norm: 28.38, NNZs: 267036, Bias: 0.292435, T: 113433268, Avg. loss: 0.513768
Total training time: 45.93 seconds.
-- Epoch 75
Norm: 28.38, NNZs: 267036, Bias: 0.292396, T: 114966150, Avg. loss: 0.513769
Total training time: 46.54 seconds.
-- Epoch 76


Norm: 28.38, NNZs: 267036, Bias: 0.292405, T: 205406188, Avg. loss: 0.513765
Total training time: 81.86 seconds.
-- Epoch 135
Norm: 28.38, NNZs: 267036, Bias: 0.292458, T: 206939070, Avg. loss: 0.513764
Total training time: 82.43 seconds.
-- Epoch 136
Norm: 28.38, NNZs: 267036, Bias: 0.292433, T: 208471952, Avg. loss: 0.513765
Total training time: 82.98 seconds.
-- Epoch 137
Norm: 28.38, NNZs: 267036, Bias: 0.292430, T: 210004834, Avg. loss: 0.513765
Total training time: 83.53 seconds.
-- Epoch 138
Norm: 28.38, NNZs: 267036, Bias: 0.292394, T: 211537716, Avg. loss: 0.513766
Total training time: 84.17 seconds.
-- Epoch 139
Norm: 28.38, NNZs: 267036, Bias: 0.292405, T: 213070598, Avg. loss: 0.513766
Total training time: 84.78 seconds.
-- Epoch 140
Norm: 28.38, NNZs: 267036, Bias: 0.292376, T: 214603480, Avg. loss: 0.513766
Total training time: 85.45 seconds.
-- Epoch 141
Norm: 28.38, NNZs: 267036, Bias: 0.292408, T: 216136362, Avg. loss: 0.513765
Total training time: 86.03 seconds.
-- Ep

Norm: 28.38, NNZs: 267036, Bias: 0.292397, T: 305043518, Avg. loss: 0.513765
Total training time: 121.44 seconds.
-- Epoch 200
Norm: 28.38, NNZs: 267036, Bias: 0.292394, T: 306576400, Avg. loss: 0.513764
Total training time: 122.02 seconds.
Wall time: 2min 2s


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=200, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=2, warm_start=False)

In [40]:
y_pred_sgd = sgd.predict(X_test)

In [41]:
sgd.score(X_test, y_test)

0.7753248786597777

### The accuracy of Logistic Regression model is 77.53%

In [42]:
print(classification_report(y_test, y_pred_sgd))

             precision    recall  f1-score   support

          0       0.78      0.76      0.77     15868
          1       0.77      0.79      0.78     16067

avg / total       0.78      0.78      0.78     31935



### Creating a Voting Classifier using Ensemble Classification Technique

In [43]:
from sklearn.ensemble import VotingClassifier

In [44]:
model_vc_1 = VotingClassifier(estimators=[('lr', lr), ('mnb', mnb), ('sgd', sgd)], voting='soft')

In [45]:
%%time
model_vc_1.fit(X_train, y_train)

[LibLinear]-- Epoch 1
Norm: 28.41, NNZs: 267036, Bias: 0.292537, T: 1532882, Avg. loss: 0.515294
Total training time: 0.56 seconds.
-- Epoch 2
Norm: 28.39, NNZs: 267036, Bias: 0.291639, T: 3065764, Avg. loss: 0.514029
Total training time: 1.13 seconds.
-- Epoch 3
Norm: 28.39, NNZs: 267036, Bias: 0.289855, T: 4598646, Avg. loss: 0.513956
Total training time: 1.70 seconds.
-- Epoch 4
Norm: 28.39, NNZs: 267036, Bias: 0.291020, T: 6131528, Avg. loss: 0.513894
Total training time: 2.27 seconds.
-- Epoch 5
Norm: 28.39, NNZs: 267036, Bias: 0.291986, T: 7664410, Avg. loss: 0.513846
Total training time: 2.84 seconds.
-- Epoch 6
Norm: 28.39, NNZs: 267036, Bias: 0.290954, T: 9197292, Avg. loss: 0.513854
Total training time: 3.47 seconds.
-- Epoch 7
Norm: 28.38, NNZs: 267036, Bias: 0.293357, T: 10730174, Avg. loss: 0.513831
Total training time: 4.06 seconds.
-- Epoch 8
Norm: 28.39, NNZs: 267036, Bias: 0.290142, T: 12263056, Avg. loss: 0.513845
Total training time: 4.62 seconds.
-- Epoch 9
Norm: 28

Norm: 28.38, NNZs: 267036, Bias: 0.292545, T: 104235976, Avg. loss: 0.513768
Total training time: 39.21 seconds.
-- Epoch 69
Norm: 28.38, NNZs: 267036, Bias: 0.292574, T: 105768858, Avg. loss: 0.513770
Total training time: 39.79 seconds.
-- Epoch 70
Norm: 28.38, NNZs: 267036, Bias: 0.292516, T: 107301740, Avg. loss: 0.513769
Total training time: 40.38 seconds.
-- Epoch 71
Norm: 28.38, NNZs: 267036, Bias: 0.292493, T: 108834622, Avg. loss: 0.513771
Total training time: 40.95 seconds.
-- Epoch 72
Norm: 28.38, NNZs: 267036, Bias: 0.292577, T: 110367504, Avg. loss: 0.513768
Total training time: 41.64 seconds.
-- Epoch 73
Norm: 28.38, NNZs: 267036, Bias: 0.292493, T: 111900386, Avg. loss: 0.513769
Total training time: 42.30 seconds.
-- Epoch 74
Norm: 28.38, NNZs: 267036, Bias: 0.292472, T: 113433268, Avg. loss: 0.513769
Total training time: 42.87 seconds.
-- Epoch 75
Norm: 28.38, NNZs: 267036, Bias: 0.292470, T: 114966150, Avg. loss: 0.513769
Total training time: 43.44 seconds.
-- Epoch 76


Norm: 28.38, NNZs: 267036, Bias: 0.292385, T: 205406188, Avg. loss: 0.513766
Total training time: 77.32 seconds.
-- Epoch 135
Norm: 28.38, NNZs: 267036, Bias: 0.292400, T: 206939070, Avg. loss: 0.513766
Total training time: 77.90 seconds.
-- Epoch 136
Norm: 28.38, NNZs: 267036, Bias: 0.292408, T: 208471952, Avg. loss: 0.513765
Total training time: 78.46 seconds.
-- Epoch 137
Norm: 28.38, NNZs: 267036, Bias: 0.292388, T: 210004834, Avg. loss: 0.513766
Total training time: 79.03 seconds.
-- Epoch 138
Norm: 28.38, NNZs: 267036, Bias: 0.292357, T: 211537716, Avg. loss: 0.513766
Total training time: 79.58 seconds.
-- Epoch 139
Norm: 28.38, NNZs: 267036, Bias: 0.292363, T: 213070598, Avg. loss: 0.513765
Total training time: 80.13 seconds.
-- Epoch 140
Norm: 28.38, NNZs: 267036, Bias: 0.292353, T: 214603480, Avg. loss: 0.513765
Total training time: 80.69 seconds.
-- Epoch 141
Norm: 28.38, NNZs: 267036, Bias: 0.292378, T: 216136362, Avg. loss: 0.513765
Total training time: 81.25 seconds.
-- Ep

Norm: 28.38, NNZs: 267036, Bias: 0.292418, T: 305043518, Avg. loss: 0.513764
Total training time: 114.99 seconds.
-- Epoch 200
Norm: 28.38, NNZs: 267036, Bias: 0.292412, T: 306576400, Avg. loss: 0.513764
Total training time: 115.61 seconds.
Wall time: 3min


VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)), ('mnb', Multinom...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=2, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [46]:
y_pred_em_1 = model_vc_1.predict(X_test)

  if diff:


In [47]:
y_array_em_1 = y_test.as_matrix(columns=None)

  """Entry point for launching an IPython kernel.


In [48]:
same_em_1 = 0
for i in range(y_test.count()):
    if(y_array_em_1[i] == y_pred_em_1[i]):
        same_em_1 = same_em_1+1
same_em_1

25534

In [49]:
print(classification_report(y_test, y_pred_em_1))

             precision    recall  f1-score   support

          0       0.80      0.80      0.80     15868
          1       0.80      0.80      0.80     16067

avg / total       0.80      0.80      0.80     31935



In [50]:
model_vc_1.score(X_test, y_test)

  if diff:


0.7995616095193362

### The accuracy of Voting Classificatin Model is 79.95%

### Now, manually testing the model, of the value of array[0] is 1, then the sentence given in Positive, and if the value of array[0] is 0, then the sentence given is Negative

In [51]:
X_testing = vectorizer.transform(["I am very happy"])

In [52]:
y_pred_testing = model_vc_1.predict(X_testing)

  if diff:


In [53]:
y_pred_testing

array([1], dtype=int64)

In [54]:
X_testing = vectorizer.transform(["I am very sad"])
y_pred_testing = model_vc_1.predict(X_testing)
y_pred_testing

  if diff:


array([0], dtype=int64)

###### Converting the trained Vectorizer and Voting Classifier model into pickle files using joblib.

In [55]:
joblib.dump(vectorizer, "tfidf.pkl", protocol = 2)
joblib.dump(model_vc_1, "mlmodel.pkl", protocol = 2)

['mlmodel.pkl']

###### Now these models can be imported anywhere for getting output.