In [None]:
import pandas as pd
from google.colab import drive

# The CSV below lives on Google Drive, so the drive has to be mounted before
# the read; otherwise this cell fails on a fresh runtime (Restart & Run All).
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/eng_headlines_60.csv')

# Per-column count of missing values. Printed explicitly because a notebook
# only renders a bare expression when it is the last statement of the cell.
print(df.isnull().sum())

# First rows of the raw data, rendered as the cell output.
df.head()

Unnamed: 0,Date,Title,sentiment
0,2017-01-05,Eliminating shadow economy to have positive im...,POSITIVE
1,2017-01-05,Two Chinese companies hit roadblock with India...,NEGATIVE
2,2017-01-05,SoftBank India Vision gets new $100,POSITIVE
3,2017-01-05,Nissan halts joint development of luxury cars ...,NEGATIVE
4,2017-01-05,Despite challenges Rajasthan continues to prog...,POSITIVE


In [None]:
from google.colab import drive

# Attach Google Drive at the path the notebook reads its CSV from.
drive.mount(mountpoint='/content/drive')

In [None]:
import spacy

# Only token lemmas are read below, so the dependency parser and the named
# entity recogniser are switched off at load time. The tagger, attribute
# ruler and lemmatizer stay enabled, so lemmas should be unaffected while
# each headline is processed noticeably faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def tokenizer(text):
  """Lemmatise `text` with the module-level spaCy pipeline.

  Args:
    text: the raw string handed to `nlp`.

  Returns:
    A single string made of the lemma of every token, joined by one space.
  """
  lemmas = [token.lemma_ for token in nlp(text)]
  return ' '.join(lemmas)

In [None]:
# Run every headline through `tokenizer` and store the result in a new column.
df['preprocessed_text'] = df['Title'].map(tokenizer)

# Sanity check: original title next to its lemmatised form for the first rows.
print(df.loc[:, ['Title', 'preprocessed_text']].head())


                                               Title  \
0  Eliminating shadow economy to have positive im...   
1  Two Chinese companies hit roadblock with India...   
2                SoftBank India Vision gets new $100   
3  Nissan halts joint development of luxury cars ...   
4  Despite challenges Rajasthan continues to prog...   

                                   preprocessed_text  
0  eliminate shadow economy to have positive impa...  
1  two chinese company hit roadblock with indian ...  
2                SoftBank India Vision get new $ 100  
3  Nissan halt joint development of luxury car wi...  
4  despite challenge Rajasthan continue to progre...  


In [None]:
# Predictor and target are each kept as a one-column DataFrame
# (list-style column selection), not as a Series.
X = df.loc[:, ['preprocessed_text']]
y = df.loc[:, ['sentiment']]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a 0.33 share of the rows for testing; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=45,
)

# Report the shape of each partition so a row-count mismatch between the
# feature and target frames is easy to spot.
for label, part in (
    ('X - Training Data Shape: ', X_train),
    ('y - Training Data Shape: ', y_train),
    ('X - Testing Data Shape: ', X_test),
    ('y - Testing Data Shape: ', y_test),
):
    print(label, part.shape)

X - Training Data Shape:  (60074, 1)
y - Training Data Shape:  (60074, 1)
X - Testing Data Shape:  (29589, 1)
y - Testing Data Shape:  (29589, 1)


In [None]:
#GridSearchCV on LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Pipeline: TF-IDF features followed by a linear SVM.
# random_state seeds LinearSVC so the search gives the same result on every
# run; the value matches the seed already used for the train/test split.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(max_iter = 10000, random_state = 45))
])

# Hyper-parameters explored by the search: vocabulary size, n-gram range,
# regularisation strength and loss function.
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams + bigrams
    'clf__C': [0.1, 1, 10],
    'clf__loss': ['hinge', 'squared_hinge']
}

# Exhaustive search with 5-fold cross-validation, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)

# Fit on the lemmatised headlines and their sentiment labels.
grid_search.fit(X_train['preprocessed_text'], y_train['sentiment'])

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'clf__C': 0.1, 'clf__loss': 'squared_hinge', 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best Score: 0.7691680693511891


In [None]:
# Predict sentiment for the held-out headlines with the fitted LinearSVC search.
predictions_gs = grid_search.predict(X_test.loc[:, 'preprocessed_text'])

In [None]:
from sklearn import metrics

# Confusion matrix, a blank-line gap, then the accuracy of the LinearSVC
# predictions; the separator reproduces the line breaks of separate prints.
print(
    metrics.confusion_matrix(y_test['sentiment'], predictions_gs),
    '\n',
    metrics.accuracy_score(y_test['sentiment'], predictions_gs),
    sep='\n',
)

[[12409  3369]
 [ 3370 10441]]


0.7722464429348744


In [None]:
#GridSearchCV on logistic Regression

from sklearn.linear_model import LogisticRegression

# TF-IDF features feeding a logistic regression classifier.
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000)),
])

# Search space: vocabulary size, n-gram range and inverse regularisation strength.
param_grid_lr = dict(
    tfidf__max_features=[1000, 5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2)],  # unigrams only, or unigrams + bigrams
    clf__C=[0.1, 1, 10],
)

# 5-fold cross-validated grid search on all available cores.
grid_search_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    cv=5,
    n_jobs=-1,
)

# Fit on the lemmatised training headlines and their labels.
grid_search_lr.fit(X_train['preprocessed_text'], y_train['sentiment'])

# Report the winning configuration and its cross-validation score.
print("Best Parameters (Logistic Regression):", grid_search_lr.best_params_)
print("Best Score (Logistic Regression):", grid_search_lr.best_score_)


Best Parameters (Logistic Regression): {'clf__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best Score (Logistic Regression): 0.7690515455647147


In [None]:
# Predict sentiment for the held-out headlines with the fitted logistic regression search.
predictions_lr = grid_search_lr.predict(X_test.loc[:, 'preprocessed_text'])

In [None]:
# Confusion matrix, a blank-line gap, then the accuracy of the logistic
# regression predictions; the separator reproduces the original line breaks.
print(
    metrics.confusion_matrix(y_test['sentiment'], predictions_lr),
    '\n',
    metrics.accuracy_score(y_test['sentiment'], predictions_lr),
    sep='\n',
)

[[12436  3342]
 [ 3401 10410]]


0.7721112575619318


In [None]:
#GridSearchCV on Naive Bayes

from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Search space: vocabulary size, n-gram range and the smoothing parameter alpha.
param_grid_nb = dict(
    tfidf__max_features=[1000, 5000, 10000],
    tfidf__ngram_range=[(1, 1), (1, 2)],  # unigrams only, or unigrams + bigrams
    clf__alpha=[0.1, 1, 10],
)

# 5-fold cross-validated grid search on all available cores.
grid_search_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=param_grid_nb,
    cv=5,
    n_jobs=-1,
)

# Fit on the lemmatised training headlines and their labels.
grid_search_nb.fit(X_train['preprocessed_text'], y_train['sentiment'])

# Report the winning configuration and its cross-validation score.
print("Best Parameters (Naive Bayes):", grid_search_nb.best_params_)
print("Best Score (Naive Bayes):", grid_search_nb.best_score_)


Best Parameters (Naive Bayes): {'clf__alpha': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best Score (Naive Bayes): 0.7400706292097421


In [None]:
# Predict sentiment for the held-out headlines with the fitted Naive Bayes search.
predictions_nb = grid_search_nb.predict(X_test.loc[:, 'preprocessed_text'])

In [None]:

# Confusion matrix, a blank-line gap, then the accuracy of the Naive Bayes
# predictions; the separator reproduces the original line breaks.
print(
    metrics.confusion_matrix(y_test['sentiment'], predictions_nb),
    '\n',
    metrics.accuracy_score(y_test['sentiment'], predictions_nb),
    sep='\n',
)

[[12118  3660]
 [ 3975  9836]]


0.7419649193957214


In [None]:
# using the ensemble method --- Voting Classifier

from sklearn.ensemble import VotingClassifier

# Take the best pipeline found by each of the three grid searches.
svc_estimator, nb_estimator, lr_estimator = (
    search.best_estimator_
    for search in (grid_search, grid_search_nb, grid_search_lr)
)

# Combine the three pipelines; 'hard' voting picks the majority class label.
voting_classifier = VotingClassifier(
    estimators=[
        ('svc', svc_estimator),
        ('nb', nb_estimator),
        ('lr', lr_estimator),
    ],
    voting='hard',
)

# Train the ensemble on the lemmatised training headlines.
voting_classifier.fit(X_train['preprocessed_text'], y_train['sentiment'])

In [None]:
# Predict sentiment for the held-out headlines with the fitted voting ensemble.
predictions_vc = voting_classifier.predict(X_test.loc[:, 'preprocessed_text'])

In [None]:
# Confusion matrix of the voting ensemble on the test labels.
print(
    metrics.confusion_matrix(
        y_true=y_test['sentiment'],
        y_pred=predictions_vc,
    )
)

[[12416  3362]
 [ 3389 10422]]


In [None]:
# Score the voting ensemble on the held-out headlines and report the result.
accuracy = voting_classifier.score(
    X=X_test['preprocessed_text'],
    y=y_test['sentiment'],
)
print("Accuracy of the voting classifier:", accuracy)

Accuracy of the voting classifier: 0.7718408868160465


In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the voting ensemble's test predictions.
print(
    classification_report(
        y_true=y_test['sentiment'],
        y_pred=predictions_vc,
    )
)

              precision    recall  f1-score   support

    NEGATIVE       0.79      0.79      0.79     15778
    POSITIVE       0.76      0.75      0.76     13811

    accuracy                           0.77     29589
   macro avg       0.77      0.77      0.77     29589
weighted avg       0.77      0.77      0.77     29589

