In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the labelled headlines. The file is read with header=None (no header
# row is consumed), so the two columns are named explicitly afterwards.
csv_path = 'kaggle-training/all-data.csv'
df = pd.read_csv(csv_path, encoding='latin-1', header=None)
df.columns = ['sentiment', 'headline']

# Preview the first ten rows.
df.head(10)

Unnamed: 0,sentiment,headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
5,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6,positive,"For the last quarter of 2010 , Componenta 's n..."
7,positive,"In the third quarter of 2010 , net sales incre..."
8,positive,Operating profit rose to EUR 13.1 mn from EUR ...
9,positive,"Operating profit totalled EUR 21.1 mn , up fro..."


In [3]:
# Model input is the headline text; the target is its sentiment label.
X, y = df['headline'], df['sentiment']

In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
    stratify=y,
)

In [5]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [6]:
# 100-tree random forest with a fixed seed. class_weight='balanced' asks
# scikit-learn to weight classes inversely to their frequency in y_train.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# Left as the cell's final expression so the fitted estimator is displayed.
random_forest_classifier.fit(X_train_vec, y_train)

RandomForestClassifier(class_weight='balanced', random_state=42)

In [7]:
# Predict labels for the vectorised test split and report the metrics.
y_pred_rf = random_forest_classifier.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
# Join the heading and the report with sep="\n" instead of print's default
# single-space separator. That space used to be prepended to the report's
# header row, pushing the column titles one character out of line with the
# values beneath them.
print("\nClassification Report:", classification_report(y_test, y_pred_rf), sep="\n")


Accuracy: 0.7546391752577319

Classification Report:
               precision    recall  f1-score   support

    negative       0.74      0.43      0.54       121
     neutral       0.76      0.95      0.84       576
    positive       0.75      0.49      0.60       273

    accuracy                           0.75       970
   macro avg       0.75      0.62      0.66       970
weighted avg       0.75      0.75      0.74       970



In [9]:
# Spot-check on one hand-written headline: vectorise it with the already
# fitted TF-IDF vectorizer, then ask the forest for its predicted label.
new_headline = [
    "microsoft to expand workforce by 20%",
]
new_headline_vec = vectorizer.transform(new_headline)
prediction = random_forest_classifier.predict(new_headline_vec)
print(prediction)

['positive']


This is the third model built. As with the others, its accuracy does not exceed 80%. This is an area to work on in the next sprint, and it may account for why so many articles come across as neutral. For instance, the model predicts correctly for headlines about layoffs, hirings, and price decreases/increases; however, at times, when a headline such as "apple to close down China plant" is put through the model, it predicts neutral.