In [1]:
from collections import namedtuple
# A sentiment class: its label and the numeric code used for it.
Sentiment = namedtuple('Sentiment', field_names=('type', 'ordinal'))

# Feature sets for one text-processing approach: the 0/1 presence matrix,
# the word-count matrix, the TF-IDF matrix, and the matching labels frame.
Approach = namedtuple('Approach', field_names=('binary', 'counts', 'tfidf', 'df_y'))

# An estimator class paired with a set of hyper-parameters for it.
Classifier = namedtuple('Classifier', field_names=('model', 'params'))

# Every approach starts out empty (None); later cells assign the entries.
approaches = dict.fromkeys(
    ('tokenization', 'stemming', 'lemmatization', 's+m', 'l+m')
)

# Lookup of sentiment descriptors keyed by label.
sentiments = {
    label: Sentiment(label, code)
    for label, code in (('negative', -1), ('positive', 1), ('neutral', 0))
}


<div class="foo">

| approach | 0 or 1, if the word exists | word counts | TFIDF |
| --- | --- | --- | --- |
| Just tokenization |  | |  |
| Stemming |  | |  |
| Lemmatization |  | |  |
| Stemming + Misspellings |  | |  |
| Lemmatization + Misspellings |  | |  |
| Any other ... |  | |  |


 
</div>

## Just Tokenizer

In [2]:
import pandas as pd
from src.Parser import ParserCsv
from src.TextProccesing import Processor
from src.TextProccesing import TokenizerBase
from src.TextProccesing import PreProcessor
from src.TextProccesing import binarize
from src.TFIDProcces import TFIDFProcessor

######
from src.Steammer import Stemmer

In [3]:
# Parse the three per-sentiment CSV files.
negative_tweets, positive_tweets, neutral_tweets = (
    ParserCsv.parse('./data/processedNegative.csv'),
    ParserCsv.parse('./data/processedPositive.csv'),
    ParserCsv.parse('./data/processedNeutral.csv'),
)

# Tokenize each collection; binary_value carries -1 / 1 / 0 for the
# negative / positive / neutral class respectively.
neg_token, pos_token, neu_token = (
    TokenizerBase(records=negative_tweets, binary_value=-1).process_all(),
    TokenizerBase(records=positive_tweets, binary_value=1).process_all(),
    TokenizerBase(records=neutral_tweets, binary_value=0).process_all(),
)

# Concatenate the three results and wrap them in a single DataFrame.
all_df = neg_token + pos_token + neu_token
df_token = pd.DataFrame(all_df)

# Roughly speaking, the clean-up step; exposes df_x / df_y afterwards.
preprocessor = PreProcessor(df_token)
preprocessor.execute()


In [4]:

tfidf = TFIDFProcessor(preprocessor.df_x) ## Build the matrix of term-occurrence coefficients (TF-IDF)
res_tfdif = tfidf.compute_tfidf()

In [5]:
res_tfdif ## Baseline numeric representation (TF-IDF matrix of the tokenized tweets)

Unnamed: 0,how,unhappy,some,dogs,like,it,though,talking,to,my,...,vikram,limaye,diana,edulji,cag,4member,amulya,appointed,agmut,cadre
0,22113,10040,27601,40102,23440,19001,34845,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,10038,8929,5263,...,0,0,0,0,0,0,0,0,0,0
2,0,2510,6900,0,0,4750,0,0,5102,0,...,0,0,0,0,0,0,0,0,0,0
3,0,8785,0,0,0,0,0,0,8929,0,...,0,0,0,0,0,0,0,0,0,0
4,0,10040,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,89633,89633,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,7143,0,...,0,0,35853,35853,0,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,5102,0,...,0,0,0,0,25609,25609,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Register the feature sets of the plain-tokenization approach.
approaches['tokenization'] = Approach(
    binary=binarize(preprocessor.df_x),
    counts=preprocessor.df_x,
    tfidf=res_tfdif,
    df_y=preprocessor.df_y,
)

## Stemming

In [4]:
# Tokenize each class again, this time passing a Stemmer as the next pipeline stage.
neg_token = TokenizerBase(
    records=negative_tweets,
    binary_value=-1,
    next_pipeline=Stemmer(records=negative_tweets),
).process_all()
pos_token = TokenizerBase(
    records=positive_tweets,
    binary_value=1,
    next_pipeline=Stemmer(records=positive_tweets),
).process_all()
neu_token = TokenizerBase(
    records=neutral_tweets,
    binary_value=0,
    next_pipeline=Stemmer(records=neutral_tweets),
).process_all()

# Merge the three classes into one frame and run the clean-up step on it.
df_stemmed = pd.DataFrame(neg_token + pos_token + neu_token)
pp_stemmed = PreProcessor(df_stemmed)
pp_stemmed.execute()

In [5]:
# Inspect the feature matrix produced by the stemming pipeline.
pp_stemmed.df_x

Unnamed: 0,how,unhappi,some,dog,like,it,though,talk,to,my,...,idfc,vikram,limay,diana,edulji,cag,4member,amulya,agmut,cadr
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# TF-IDF over the stemmed features, then register all stemming feature sets.
tfidf = TFIDFProcessor(pp_stemmed.df_x)
approaches['stemming'] = Approach(
    binary=binarize(pp_stemmed.df_x),
    counts=pp_stemmed.df_x,
    tfidf=tfidf.compute_tfidf(),
    df_y=pp_stemmed.df_y,
)

## Lemmatization

In [6]:
from src.Lemmatize import Lemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ifanzilka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Tokenize each class with a Lemmatizer as the next pipeline stage.
neg_token_lem, pos_token_lem, neu_token_lem = (
    TokenizerBase(
        records=negative_tweets,
        binary_value=-1,
        next_pipeline=Lemmatizer(records=negative_tweets),
    ).process_all(),
    TokenizerBase(
        records=positive_tweets,
        binary_value=1,
        next_pipeline=Lemmatizer(records=positive_tweets),
    ).process_all(),
    TokenizerBase(
        records=neutral_tweets,
        binary_value=0,
        next_pipeline=Lemmatizer(records=neutral_tweets),
    ).process_all(),
)

# Merge the three classes into one frame and run the clean-up step on it.
all_lemmed = pd.DataFrame(neg_token_lem + pos_token_lem + neu_token_lem)
pp_lemmed = PreProcessor(all_lemmed)
pp_lemmed.execute()

In [8]:
# Inspect the feature matrix produced by the lemmatization pipeline.
pp_lemmed.df_x

Unnamed: 0,how,unhappy,some,dog,like,it,though,talking,to,my,...,vikram,limaye,diana,edulji,cag,4member,amulya,appointed,agmut,cadre
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# TF-IDF over the lemmatized features, then register all lemmatization feature sets.
tfidf_lemmed = TFIDFProcessor(pp_lemmed.df_x)
approaches['lemmatization'] = Approach(
    binary=binarize(pp_lemmed.df_x),
    counts=pp_lemmed.df_x,
    tfidf=tfidf_lemmed.compute_tfidf(),
    df_y=pp_lemmed.df_y,
)

## Stemming + Misspellings


In [9]:
from src.Corrector import MisspellingsCorrector

In [10]:
# Tokenize each class with a Stemmer whose own next stage is a MisspellingsCorrector.
# NOTE(review): the corrector is chained *after* the Stemmer; confirm that
# spell-correcting already-stemmed tokens is the intended order.
neg_token_corr = TokenizerBase(
    records=negative_tweets,
    binary_value=-1,
    next_pipeline=Stemmer(
        records=negative_tweets,
        next_pipeline=MisspellingsCorrector(records=negative_tweets),
    ),
).process_all()
pos_token_corr = TokenizerBase(
    records=positive_tweets,
    binary_value=1,
    next_pipeline=Stemmer(
        records=positive_tweets,
        next_pipeline=MisspellingsCorrector(records=positive_tweets),
    ),
).process_all()
neu_token_corr = TokenizerBase(
    records=neutral_tweets,
    binary_value=0,
    next_pipeline=Stemmer(
        records=neutral_tweets,
        next_pipeline=MisspellingsCorrector(records=neutral_tweets),
    ),
).process_all()

# Merge the three classes into one frame and run the clean-up step on it.
all_stemmed_corrected = pd.DataFrame(neg_token_corr + pos_token_corr + neu_token_corr)
pp_stemmed_corrected = PreProcessor(all_stemmed_corrected)
pp_stemmed_corrected.execute()

In [11]:
# Inspect the feature matrix produced by the stemming + misspelling-correction pipeline.
pp_stemmed_corrected.df_x

Unnamed: 0,how,unhappi,some,dog,like,it,though,talk,to,my,...,attain,martyrdom,dept,hoarder,payment,rs25000,historian,diana,gamut,cadr
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,1,3,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,1,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3869,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3870,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3871,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:

# TF-IDF over the stemmed + corrected features, then register the 's+m' feature sets.
tfidf_stemmed_corrected = TFIDFProcessor(pp_stemmed_corrected.df_x)
approaches['s+m'] = Approach(
    binary=binarize(pp_stemmed_corrected.df_x),
    counts=pp_stemmed_corrected.df_x,
    tfidf=tfidf_stemmed_corrected.compute_tfidf(),
    df_y=pp_stemmed_corrected.df_y,
)

# Machine Learning

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

### Simple

In [13]:
# Hold out half of the tokenization word-count features for evaluation.
# The fixed random_state makes the split, and therefore every score computed
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    approaches['tokenization'].counts,
    approaches['tokenization'].df_y.sentiment,
    test_size=0.5,
    random_state=42,
)


#### LogisticRegression

In [14]:
# Baseline logistic regression on the word-count features.
lr = LogisticRegression(max_iter=1000, solver='newton-cg')
lr.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric in
# its arguments, so the displayed value is the same as before.
accuracy_score(y_test, lr.predict(X_test))

0.9012987012987013

#### KNeighborsClassifier

In [15]:
# Baseline k-nearest-neighbours classifier; the neighbour count is named explicitly.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric in
# its arguments, so the displayed value is the same as before.
accuracy_score(y_test, knn.predict(X_test))

0.8077922077922078

#### RandomForestClassifier

In [16]:
# Baseline random forest; seeded so repeated runs give the same forest and score.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric in
# its arguments, so only the seeding above can change the displayed value.
accuracy_score(y_test, rfc.predict(X_test))

0.8935064935064935

### Advanced

In [24]:
# Candidate estimators for the grid search: each entry pairs an estimator
# class (not an instance) with the hyper-parameter grid to explore for it.
models = {
    "logistic": Classifier(
        LogisticRegression,
        {
            "C": [1.0, 2.0, 0.5, 0.25],
            "solver": ('newton-cg', 'sag', 'saga'),
            "max_iter": [500],
        },
    ),
    "randomforest": Classifier(
        RandomForestClassifier,
        {
            "n_estimators": [100, 300, 500],
            "max_depth": [25, 30],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2],
        },
    ),
    "knn": Classifier(
        KNeighborsClassifier,
        {
            "n_neighbors": range(2, 7),
            "algorithm": ['ball_tree', 'kd_tree', 'auto'],
        },
    ),
    "decisiontree": Classifier(
        DecisionTreeClassifier,
        {
            "max_features": ['sqrt', 'log2', None],
            "criterion": ["gini", "entropy"],
            "min_samples_split": [2, 3, 4],
        },
    ),
}

def optimize_model_params(classifier: Classifier, x_train, y_train):
    """Grid-search the hyper-parameters of one candidate estimator.

    Args:
        classifier: Pair of an estimator class and the parameter grid to explore.
        x_train: Training features handed to the search.
        y_train: Training labels handed to the search.

    Returns:
        A ``(best_params, best_score)`` tuple taken from the fitted
        ``GridSearchCV`` (its ``best_params_`` and ``best_score_`` attributes).
    """
    estimator = classifier.model()
    # n_jobs=-1 lets the search use every available core.
    search = GridSearchCV(estimator, param_grid=classifier.params, n_jobs=-1)
    search.fit(x_train, y_train)
    return search.best_params_, search.best_score_

def find_best_model(df_x, df_y, random_state=None):
    """Pick the candidate from ``models`` with the best grid-search score.

    The data is split 70/30 and only the 70% training part is passed to the
    grid search; the held-out 30% is not evaluated by this function.

    Args:
        df_x: Feature matrix.
        df_y: Labels aligned with ``df_x``.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            split can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A ``Classifier`` holding the winning estimator class and its best
        parameters, or ``None`` when ``models`` is empty or no candidate
        produced a comparable score.
    """
    # The hold-out halves are discarded explicitly instead of being bound to unused names.
    X_train, _, y_train, _ = train_test_split(
        df_x, df_y, test_size=0.3, random_state=random_state
    )
    # Start below any real score so a candidate scoring exactly 0 can still win.
    max_accuracy = float('-inf')
    best_model = None
    for name, model in models.items():
        print(f'optimizing {name}')
        best_params, best_accuracy = optimize_model_params(model, X_train, y_train)
        print(f'Best accuracy {best_accuracy} for model: {name}\nBest params: {best_params}')
        if best_accuracy > max_accuracy:
            max_accuracy = best_accuracy
            best_model = Classifier(model.model, best_params)
    return best_model

In [25]:
# Run the model search on the word-count features of every populated approach.
trained_models = {}
for name, approach in approaches.items():
    print(f'Approach {name}')
    if approach is None:
        # Nothing registered for this approach: emit only the separator line.
        print()
        continue
    trained_models[name] = find_best_model(approach.counts, approach.df_y.sentiment)
    print()

Approach tokenization
optimizing logistic
Best accuracy 0.8968094571387191 for model: logistic
Best params: {'C': 2.0, 'max_iter': 500, 'solver': 'newton-cg'}
optimizing randomforest
Best accuracy 0.881593340276293 for model: randomforest
Best params: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
optimizing knn
Best accuracy 0.8125462959769916 for model: knn
Best params: {'algorithm': 'auto', 'n_neighbors': 5}
optimizing decisiontree
Best accuracy 0.8812174548765095 for model: decisiontree
Best params: {'criterion': 'entropy', 'max_features': None, 'min_samples_split': 2}

Approach stemming

Approach lemmatization

Approach s+m

Approach l+m



In [27]:
# Run the model search on the TF-IDF features of every populated approach.
trained_models2 = {}
for name, approach in approaches.items():
    print(f'Approach {name}')
    if approach is None:
        # Nothing registered for this approach: emit only the separator line.
        print()
        continue
    trained_models2[name] = find_best_model(approach.tfidf, approach.df_y.sentiment)
    print()


Approach tokenization
optimizing logistic




KeyboardInterrupt: 