In [29]:
import pandas as pd

# Location of the headlines CSV (the path sits under /content/drive, so this
# presumably expects Google Drive to be mounted in Colab - confirm before running).
headlines_csv = '/content/drive/MyDrive/Colab Notebooks/eng_headlines_60.csv'
df = pd.read_csv(headlines_csv)

# Per-column count of missing values; all zeros means no row needs dropping or imputing.
df.isna().sum()

Date         0
Title        0
sentiment    0
dtype: int64

In [30]:
# How many headlines carry each sentiment label (class balance of the target).
df.sentiment.value_counts()

NEGATIVE    47835
POSITIVE    41828
Name: sentiment, dtype: int64

In [31]:
import spacy

# Only token lemmas are read from the processed text in this notebook, so the
# dependency parser and the entity recognizer are switched off: they cost time
# on every headline and do not feed token.lemma_.
# NOTE(review): re-enable them if any later cell needs doc.ents, doc.sents or
# dependency labels.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenizer(text):
  """Lemmatize ``text`` and return the lemmas as one space-separated string.

  The text is run through the module-level spaCy pipeline ``nlp``; the
  ``lemma_`` of every resulting token is collected in order and the pieces
  are joined with single spaces.
  """
  doc = nlp(text)
  lemmas = []
  for tok in doc:
    lemmas.append(tok.lemma_)
  return ' '.join(lemmas)

In [32]:
# Lemmatize every headline and store the result next to the original title.
lemmatized_titles = df['Title'].map(tokenizer)
df['preprocessed_text'] = lemmatized_titles

# Show the first rows of raw vs. lemmatized text side by side as a sanity check.
preview_columns = ['Title', 'preprocessed_text']
print(df[preview_columns].head())

                                               Title  \
0  Eliminating shadow economy to have positive im...   
1  Two Chinese companies hit roadblock with India...   
2                SoftBank India Vision gets new $100   
3  Nissan halts joint development of luxury cars ...   
4  Despite challenges Rajasthan continues to prog...   

                                   preprocessed_text  
0  eliminate shadow economy to have positive impa...  
1  two chinese company hit roadblock with indian ...  
2                SoftBank India Vision get new $ 100  
3  Nissan halt joint development of luxury car wi...  
4  despite challenge Rajasthan continue to progre...  


In [33]:
# Model input: the lemmatized headline text, kept as a one-column DataFrame.
X = df['preprocessed_text'].to_frame()
# Target: the sentiment label, likewise kept as a one-column DataFrame.
y = df['sentiment'].to_frame()

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.33, random_state=45)
X_train, X_test, y_train, y_test = splits

# Report the shape of each piece so that mismatched row counts are spotted early.
shape_report = [
  ('X - Training Data Shape: ', X_train),
  ('y - Training Data Shape: ', y_train),
  ('X - Testing Data Shape: ', X_test),
  ('y - Testing Data Shape: ', y_test),
]
for label, frame in shape_report:
  print(label, frame.shape)

X - Training Data Shape:  (60074, 1)
y - Training Data Shape:  (60074, 1)
X - Testing Data Shape:  (29589, 1)
y - Testing Data Shape:  (29589, 1)


In [35]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Three text classifiers, each a TF-IDF vectorizer (default settings) followed
# by a different estimator, so they can be compared on identical features.

# Naive Bayes
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

# Linear SVC
# random_state seeds the data shuffling done by the underlying solver, so the
# fitted model and its scores are the same on every re-run. 45 is the same seed
# the train/test split uses.
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC(random_state=45))])

# Logistic Regression
# max_iter=1000 because lbfgs did not converge within the default number of iterations.
text_clf_lr_lbfgs = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression(solver='lbfgs', max_iter=1000))])

In [36]:
# Train the Naive Bayes pipeline on the lemmatized training headlines.
text_clf_nb.fit(X=X_train['preprocessed_text'], y=y_train['sentiment'])

In [37]:
# Naive Bayes sentiment predictions for the held-out headlines.
test_texts = X_test['preprocessed_text']
predictions_nb = text_clf_nb.predict(test_texts)

In [38]:
from sklearn import metrics
# Confusion matrix for Naive Bayes: rows are true labels, columns are predictions.
nb_confusion = metrics.confusion_matrix(y_true=y_test['sentiment'], y_pred=predictions_nb)
print(nb_confusion)

[[12753  3025]
 [ 4825  8986]]


In [39]:
# Overall accuracy followed by per-class precision / recall / F1 for Naive Bayes.
nb_accuracy = metrics.accuracy_score(y_test['sentiment'], predictions_nb)
nb_report = metrics.classification_report(y_test['sentiment'], predictions_nb)
# The middle '\n' item reproduces the blank lines between the two outputs.
print(nb_accuracy, '\n', nb_report, sep='\n')

0.7346987056000541


              precision    recall  f1-score   support

    NEGATIVE       0.73      0.81      0.76     15778
    POSITIVE       0.75      0.65      0.70     13811

    accuracy                           0.73     29589
   macro avg       0.74      0.73      0.73     29589
weighted avg       0.74      0.73      0.73     29589



In [40]:
# Train the Linear SVC pipeline on the lemmatized training headlines
# (X_train only carries the 'preprocessed_text' column).
text_clf_lsvc.fit(X=X_train['preprocessed_text'], y=y_train['sentiment'])


In [41]:
# Linear SVC sentiment predictions for the held-out headlines.
predictions_lsvc = text_clf_lsvc.predict(X_test.preprocessed_text)

In [42]:
# Confusion matrix for Linear SVC: rows are true labels, columns are predictions.
lsvc_confusion = metrics.confusion_matrix(y_true=y_test['sentiment'], y_pred=predictions_lsvc)
print(lsvc_confusion)

[[12271  3507]
 [ 3441 10370]]


In [43]:
# Per-class precision / recall / F1 for Linear SVC.
lsvc_report = metrics.classification_report(y_true=y_test['sentiment'], y_pred=predictions_lsvc)
print(lsvc_report)

              precision    recall  f1-score   support

    NEGATIVE       0.78      0.78      0.78     15778
    POSITIVE       0.75      0.75      0.75     13811

    accuracy                           0.77     29589
   macro avg       0.76      0.76      0.76     29589
weighted avg       0.77      0.77      0.77     29589



In [44]:
# Fraction of held-out headlines that Linear SVC labelled correctly.
lsvc_accuracy = metrics.accuracy_score(y_true=y_test['sentiment'], y_pred=predictions_lsvc)
print(lsvc_accuracy)

0.7651830071986211


In [45]:
# Train the lbfgs Logistic Regression pipeline on the lemmatized training headlines.
text_clf_lr_lbfgs.fit(X=X_train['preprocessed_text'], y=y_train['sentiment'])

In [46]:
# Logistic Regression (lbfgs) sentiment predictions for the held-out headlines.
predictions_lr_lbfgs = text_clf_lr_lbfgs.predict(X_test.preprocessed_text)

In [47]:
# Confusion matrix for Logistic Regression (lbfgs): rows are true labels, columns are predictions.
lr_lbfgs_confusion = metrics.confusion_matrix(y_true=y_test['sentiment'], y_pred=predictions_lr_lbfgs)
print(lr_lbfgs_confusion)

[[12355  3423]
 [ 3496 10315]]


In [48]:
# Per-class precision / recall / F1 for Logistic Regression (lbfgs).
lr_lbfgs_report = metrics.classification_report(y_true=y_test['sentiment'], y_pred=predictions_lr_lbfgs)
print(lr_lbfgs_report)

              precision    recall  f1-score   support

    NEGATIVE       0.78      0.78      0.78     15778
    POSITIVE       0.75      0.75      0.75     13811

    accuracy                           0.77     29589
   macro avg       0.77      0.76      0.77     29589
weighted avg       0.77      0.77      0.77     29589



In [49]:
# Fraction of held-out headlines that Logistic Regression (lbfgs) labelled correctly.
lr_lbfgs_accuracy = metrics.accuracy_score(y_true=y_test['sentiment'], y_pred=predictions_lr_lbfgs)
print(lr_lbfgs_accuracy)

0.7661631011524553


In [None]:
# Logistic Regression using a different solver - saga. By default, LR uses lbfgs.

# lbfgs - Limited-memory BFGS
# saga  - a variant of SAG (Stochastic Average Gradient)

# saga shuffles the data while fitting, so random_state is fixed to make the
# fitted model and its scores repeatable (45 is the seed the train/test split
# uses). max_iter matches the lbfgs pipeline so both solvers get the same
# iteration budget.
text_clf_lr_saga = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression(solver='saga', max_iter=1000, random_state=45))])

In [None]:
# Train the saga Logistic Regression pipeline on the lemmatized training headlines.
text_clf_lr_saga.fit(X=X_train['preprocessed_text'], y=y_train['sentiment'])

In [None]:
# Logistic Regression (saga) sentiment predictions for the held-out headlines.
predictions_lr_saga = text_clf_lr_saga.predict(X_test.preprocessed_text)

In [None]:
# Confusion matrix for Logistic Regression (saga): rows are true labels, columns are predictions.
lr_saga_confusion = metrics.confusion_matrix(y_true=y_test['sentiment'], y_pred=predictions_lr_saga)
print(lr_saga_confusion)

[[12352  3426]
 [ 3495 10316]]


In [None]:
# Per-class precision / recall / F1 for Logistic Regression (saga).
lr_saga_report = metrics.classification_report(y_true=y_test['sentiment'], y_pred=predictions_lr_saga)
print(lr_saga_report)

              precision    recall  f1-score   support

    NEGATIVE       0.78      0.78      0.78     15778
    POSITIVE       0.75      0.75      0.75     13811

    accuracy                           0.77     29589
   macro avg       0.77      0.76      0.76     29589
weighted avg       0.77      0.77      0.77     29589



In [None]:
# Fraction of held-out headlines that Logistic Regression (saga) labelled correctly.
lr_saga_accuracy = metrics.accuracy_score(y_true=y_test['sentiment'], y_pred=predictions_lr_saga)
print(lr_saga_accuracy)

0.766095508465984
