In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [2]:
train_data = pd.read_csv( 'data_clean.csv' )
train_data = train_data[train_data['text'].notnull()]

In [3]:
train_data.head()

Unnamed: 0,text,sentiment
0,according gran company plans move production r...,neutral
1,technopolis plans develop stages area less squ...,neutral
2,international electronic industry company elco...,negative
3,new production plant company would increase ca...,positive
4,according company updated strategy years baswa...,positive


In [4]:
text = list(train_data['text'])
sentiments = list(train_data['sentiment'])

In [5]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="word"
                             , sublinear_tf=True, ngram_range=(1,3), max_features=10000) 

In [6]:
X = vectorizer.fit_transform(text).toarray()
y = np.array(sentiments)

In [7]:
X.shape

(4845, 10000)

In [8]:
features = vectorizer.get_feature_names()

In [9]:
features

['ab',
 'ab publ',
 'ab sto',
 'abb',
 'ability',
 'able',
 'able deliver',
 'abloy',
 'abn',
 'abn amro',
 'abp',
 'abp hel',
 'abp hel fisas',
 'abp net',
 'abp net profit',
 'abroad',
 'ac',
 'ac cxe',
 'ac cxe amplifier',
 'ac drives',
 'ac drives manufacturer',
 'ac million',
 'ac million million',
 'ac million us',
 'ac million year',
 'ac nielsen',
 'ac nielsen data',
 'ac us',
 'ac us helsinki',
 'acacia',
 'access',
 'accessories',
 'accident',
 'accommodate',
 'accommodate steel',
 'accommodate steel grc',
 'accordance',
 'accordance agreement',
 'accordance chapter',
 'accordance chapter section',
 'accordance section',
 'according',
 'according ac',
 'according ac nielsen',
 'according ceo',
 'according ceo matti',
 'according company',
 'according finnish',
 'according interim',
 'according interim report',
 'according notice',
 'according olvi',
 'according preliminary',
 'according report',
 'according rules',
 'according scanfil',
 'account',
 'accountant',
 'accounted'

In [10]:
RANDOM_SEED = 42
TEST_SPLIT = 0.2

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [11]:
lgs = LogisticRegression(class_weight='balanced') 
lgs.fit(X_train, y_train)
y_pred = lgs.predict(X_eval)
print("Accuracy: %f" % lgs.score(X_eval, y_eval))
print(classification_report(y_eval,y_pred))

Accuracy: 0.754386
              precision    recall  f1-score   support

    negative       0.65      0.59      0.62       113
     neutral       0.79      0.86      0.83       567
    positive       0.71      0.61      0.65       289

    accuracy                           0.75       969
   macro avg       0.72      0.69      0.70       969
weighted avg       0.75      0.75      0.75       969



In [12]:
lgs = LogisticRegression(solver = 'newton-cg', multi_class = 'multinomial') 
lgs.fit(X_train, y_train)
y_pred = lgs.predict(X_eval)
print("Accuracy: %f" % lgs.score(X_eval, y_eval))
print(classification_report(y_eval,y_pred))

Accuracy: 0.729618
              precision    recall  f1-score   support

    negative       0.80      0.32      0.46       113
     neutral       0.71      0.97      0.82       567
    positive       0.78      0.43      0.55       289

    accuracy                           0.73       969
   macro avg       0.77      0.57      0.61       969
weighted avg       0.75      0.73      0.70       969



In [None]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000) 

forest.fit( X_train, y_train )
y_pred = forest.predict(X_eval)
print("Accuracy: %f" % forest.score(X_eval, y_eval))
print(classification_report(y_eval,y_pred))

In [None]:
from sklearn.svm import SVC
classifier = SVM(kernel = 'linear')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_eval)
print("Accuracy: %f" % classifier.score(X_eval, y_eval))
print(classification_report(y_eval,y_pred))