In [1]:
import sys

import csv
import pandas as pd
import numpy as np

import joblib
import pickle

import sklearn.ensemble
import sklearn.metrics

from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# Logistic Regression

In [3]:
train = pd.read_csv('train_tweets_full_2.csv',header=(0))
train.dropna()
X = list(train["tweets"])
y = list(train["maj_label"])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
from sklearn import metrics
def print_report(pipe):
    y_actuals = y_test
    y_preds = pipe.predict(X_test)
    report = metrics.classification_report(y_actuals, y_preds)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_actuals, y_preds)))

In [5]:
model = joblib.load('model_LR2.pkl')
print_report(model)



              precision    recall  f1-score   support

  derogatory       0.88      0.52      0.66      1470
      normal       0.77      0.99      0.87      7737
        spam       1.00      0.02      0.04      1654

    accuracy                           0.78     10861
   macro avg       0.88      0.51      0.52     10861
weighted avg       0.82      0.78      0.71     10861

accuracy: 0.779


In [6]:
# get overall weights
vec = model.steps[0][1]
clf = model.steps[2][1]
for i, tag in enumerate(clf.classes_):
    coefficients = clf.coef_[i]
    weights = list(zip(vec.get_feature_names(),coefficients))
    print('Tag:',tag)
    print('Most Positive Coefficients:')
    print(sorted(weights,key=lambda x: -x[1])[:10])
    print('Most Negative Coefficients:')
    print(sorted(weights,key=lambda x: x[1])[:10])
    print("--------------------------------------")

Tag: derogatory
Most Positive Coefficients:
[('idiot', 8.384144029045096), ('idiots', 7.16733446491027), ('fucking', 6.952766004423737), ('rt', 6.90935999053271), ('bad', 4.40000746260379), ('retarded', 4.271009053181252), ('ass', 4.067765785243252), ('fucked', 3.7938526342865186), ('hell', 3.7100983874893148), ('damn', 3.574748098582929)]
Most Negative Coefficients:
[('good', -1.6700954791900573), ('new', -1.377742164297901), ('mean', -1.32774905547556), ('enough', -1.2710794438477495), ('late', -1.2022169775036722), ('co', -1.1872949976325533), ('down', -1.174563897019097), ('only', -1.1672710682580456), ('road', -1.1558058122352879), ('own', -1.1461722699491044)]
--------------------------------------
Tag: normal
Most Positive Coefficients:
[('characters', 1.2098778652217332), ('nice', 1.2097139247327509), ('100', 1.1986169771008859), ('happen', 1.123610239969527), ('thoughts', 1.0906178241629014), ('ai', 1.0535383645544178), ('premium', 1.0434694620165041), ('player', 1.03034262843

In [7]:
# get weights for individual examples

y_preds = model.predict(X)
train['predicted_label'] = y_preds

def get_weight_one_sample(words):
    for i, tag in enumerate(clf.classes_):
        pad = "    "
        coefficients = clf.coef_[i]
        weights = list(zip(vec.get_feature_names(),coefficients))
        print('Tag:',tag)
        weights_sorted = sorted(weights,key=lambda x: -x[1])
        weights_sorted = dict(weights_sorted)
        overall_weight = 0
        for i in words:
            val = weights_sorted.get(i,0)
            if val is not 0:
                print(pad + str(i) + ', ' + str(val))
            overall_weight = overall_weight + val
        pad = "  "
        print(pad + "overall weight: " + str(overall_weight))
    print('*')

# spam
idx = 5089
tweet = X[idx]
label = y[idx]
print(tweet)
#print(label)
#print(y_preds[idx])
print('*')
get_weight_one_sample(set(tweet.split()))

IAH - Domestico to International flight - I forgot to say, both flights are with United. https://t.co/md2c7Akwfk
*
Tag: derogatory
    forgot, -0.16791344370801273
    flight, -0.3170312719761709
    with, -0.2577867526046609
    to, -0.6043078524476211
    both, -0.04463652152194795
    are, 0.4316134478454083
    flights, 0.2591054852021373
  overall weight: -0.7009569092108678
Tag: normal
    forgot, 0.04226449110022955
    flight, -0.44716547791401795
    with, -0.21921179098080978
    to, 0.2953410223317563
    both, -0.2951991403242208
    are, -0.33236651798375993
    flights, -0.5554873792742393
  overall weight: -1.511824793045062
Tag: spam
    forgot, 0.1256489526077836
    flight, 0.7641967498901909
    with, 0.4769985435854655
    to, 0.3089668301158796
    both, 0.339835661846169
    are, -0.0992469298616439
    flights, 0.29638189407210286
  overall weight: 2.2127817022559477
*


In [8]:
# derogatory
idx = 108#14
tweet = X[idx]
label = y[idx]
print(tweet)
print(label)
get_weight_one_sample(set(tweet.split()))
# normal
idx = 54270
tweet = X[idx]
label = y[idx]
print(tweet)
print(label)
get_weight_one_sample(set(tweet.split()))

It's always the filthy bitch that comes in the picture.. 🤦🏽‍♂️
derogatory
Tag: derogatory
    bitch, 3.01906509545287
    comes, 0.05109874550810478
    that, -0.006718444559540498
    always, 0.11040574296039389
    in, -0.6608141716450816
    filthy, 1.632728222328763
    the, -0.008964799218398491
  overall weight: 4.13680039082711
Tag: normal
    bitch, -1.5017129556538038
    comes, -0.1380461187258348
    that, -0.20459392079932168
    always, 0.3497067652373991
    in, 0.4283272847299151
    filthy, -1.309619791495735
    the, 0.07404841429734252
  overall weight: -2.3018903224100384
Tag: spam
    bitch, -1.517352139799081
    comes, 0.08694737321773033
    that, 0.2113123653588483
    always, -0.46011250819779775
    in, 0.23248688691514086
    filthy, -0.32310843083302915
    the, -0.06508361507895734
  overall weight: -1.8349100684171458
*
one person followed me and one person unfollowed me // automatically checked by https://t.co/J66pWkCLxT
normal
Tag: derogatory
    me, 0.5

In [9]:
# let's see what happens when we retrain things on misclassified examples

y_preds = model.predict(X)
train['predicted_label'] = y_preds
normal_examples = train[((train['maj_label']=="spam"))&(train['predicted_label']=="spam")]
print(normal_examples)
misclassified_examples = train[(train['maj_label']!=train['predicted_label'])]
print(misclassified_examples)

                 tweet_id maj_label  \
1339   849770809681956864      spam   
4990   848926336890662914      spam   
5089   847877626698092545      spam   
5439   850468238525964288      spam   
5484   847467465697239044      spam   
5539   849488700786573312      spam   
5625   850703639656247300      spam   
5956   849924774205771778      spam   
6265   848996843149287427      spam   
6723   848154865973047296      spam   
6922   850455139752325120      spam   
7066   849141735376785409      spam   
7450   847403544479096833      spam   
8833   849551032338272256      spam   
9122   849799918134886400      spam   
9521   850153506354978816      spam   
9914   848792286930620416      spam   
10202  849564261181517827      spam   
10727  849027507709980675      spam   
11006  848881705289166848      spam   
11019  849907380397768704      spam   
11071  849696054589239297      spam   
11161  848148167682076672      spam   
11445  847542921238900736      spam   
11547  850732546824577025

In [10]:
X = list(misclassified_examples["tweets"])
y = list(misclassified_examples["maj_label"])
y_false = list(misclassified_examples["predicted_label"])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)
model.fit(X_train, y_train)

y_preds = model.predict(X_test)
report = metrics.classification_report(y_test, y_preds)
print(report)
print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_preds)))

              precision    recall  f1-score   support

  derogatory       0.28      0.03      0.06       659
      normal       0.84      0.61      0.70        76
        spam       0.71      0.97      0.82      1688

    accuracy                           0.70      2423
   macro avg       0.61      0.54      0.53      2423
weighted avg       0.60      0.70      0.61      2423

accuracy: 0.702


In [11]:
vec = model.steps[0][1]
clf = model.steps[2][1]
for i, tag in enumerate(clf.classes_):
    coefficients = clf.coef_[i]
    weights = list(zip(vec.get_feature_names(),coefficients))
    print('Tag:',tag)
    print('Most Positive Coefficients:')
    print(sorted(weights,key=lambda x: -x[1])[:10])
    print('Most Negative Coefficients:')
    print(sorted(weights,key=lambda x: x[1])[:10])
    print("--------------------------------------")

Tag: derogatory
Most Positive Coefficients:
[('stand', 1.0047159519820077), ('thank', 0.9793222323146172), ('without', 0.9648489503886257), ('american', 0.9189451173753155), ('coffee', 0.8697660132374775), ('rising', 0.8662864873397012), ('pleasure', 0.8536614824107929), ('2017', 0.8356997973590333), ('makes', 0.8251379520503926), ('get', 0.7859521885878996)]
Most Negative Coefficients:
[('rt', -3.895117470621849), ('fucking', -2.876284654404537), ('idiot', -2.49240419901193), ('ass', -1.6542050893435412), ('idiots', -1.2280074791300748), ('hate', -1.1618588277728645), ('bad', -0.9450558081252478), ('bitch', -0.9449889086660689), ('ugly', -0.902504784788593), ('stupid', -0.8229468351551408)]
--------------------------------------
Tag: normal
Most Positive Coefficients:
[('rt', 8.02871269458181), ('fucking', 6.21538492047141), ('idiot', 5.256941111927054), ('ass', 3.9007992192050778), ('bad', 2.8838996954492897), ('idiots', 2.812234781895666), ('hell', 2.704070663503978), ('hate', 2.633