In [67]:
# all modules
import random
import datetime
from time import time
from collections import Counter

from nltk.tokenize import regexp_tokenize
import spacy
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit, GridSearchCV

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
from pymongo import MongoClient

from spellchecker import SpellChecker

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# MongoClient, pymongo, mongodb
Our twitter data is in mongodb. 
In this part, create a MongoClient. Connect to the mongodb with the credentials provided to you. The name of the database is TWEEDY. The name of the collection is Tweet. 
Remember the twitter data is saved in json files. In your db.collection.find, access tweetID, text, and edInput fields. 

In [68]:
## get the data from Mongodb
## setup connection
## The class MongoClient enables you to make successful MongoDB server connections with your code. 
## client instanse should be returned if the connection is successful.
## "mongodb://user:password@example.com/database"
client = MongoClient('mongodb://tweedyRead:...../TWEEDY')

## treat the mongo database like a dictionary key
## Tweedy is the name of the database. Tweet is the collection.
db=client['TWEEDY']

## for a detailed explanation on db.collection.find
## please visit https://docs.mongodb.com/manual/reference/method/db.collection.find/
## look up data to setup the dataframe
## db.bios.find( { }, { "tweetID": 1, "text":1, "edInput":1 } ) 
## The second curly braces include only the parameters to include or exclude (1, 0)
## Unless the _id field is explicitly excluded in the projection document _id: 0, the _id field is returned.
df = pd.DataFrame(list(db.Tweet.find({"edInput":{ "$in": [ 1, 2 ] },"topicName":"Business",
                        "crDate":{"$gt": datetime.datetime(2020, 3, 1, 0, 0, 0),"$lt": datetime.datetime(2020, 4, 4, 0, 0, 0)}},
                        {"_id":0,"tweetID":1,"text":1,"edInput":1} )))

## replace the rejected encoding2, 0 from 2 to 0
df['edInput'].replace(2, 0, inplace=True)

## (***) choosing to keep indexes simple
## set tweetID as index for df
## df.set_index('tweetID',inplace=True)

In [69]:
pd.set_option('display.max_colwidth', 300)

In [70]:
df

Unnamed: 0,tweetID,edInput,text
0,1233904810803507200,1,"It will take good, determined individuals to force our capitalist system to recalibrate before an upheaval. And private-sector leaders should be leading the charge. https://t.co/sx9NGrkmho"
1,1233905301163782145,1,Test your job candidates. You should be pleased with the results. https://t.co/GQLImkoRxj
2,1233905647575535617,1,"Joe Biden, confident of a strong win in South Carolina’s primary, is already looking toward Super Tuesday https://t.co/li98lefYP7"
3,1233907176575508484,1,JUST IN: Joe Biden wins South Carolina primary https://t.co/ZxYRjpzOpa https://t.co/uLKeEicTnf
4,1233909786195959809,1,"Just in: Joe Biden wins the South Carolina primary, making it his first-ever victory in three presidential runs. https://t.co/ouwV1wSWk7 https://t.co/FYOTXpmnZ3"
...,...,...,...
4786,1236053560636268545,0,Austin's SXSW media and music festival is the latest conference in the tech world to be canceled over coronavirus concerns.\n\nHere's a list of all the high-profile tech events that have done the same.\n\nhttps://t.co/dt9qIDeuII
4787,1236057371547496449,0,"Joe Biden's campaign has seen an unprecedented influx of donations and offers to host fundraisers for him following his success on Super Tuesday.\n\nFull story on @businessinsider, via @AP. https://t.co/TV7mZXDKhe"
4788,1236064036938231817,0,"At #BloombergEquality @OneCarlyle Co-Founder David Rubenstein sits down with one of the most powerful and influential women in tech, @Google & @Alphabetlnc CFO Ruth Porat. Request an invitation: https://t.co/zfSwu2m5vn #IWD2020 https://t.co/ckgyeikeNT"
4789,1236282216034054144,0,"The new coronavirus lives on surfaces for ""hours to a day,"" one expert says. Here's how to disinfect them properly. https://t.co/dAAD5hmhtl"


In [None]:
len(df[df.edInput ==1])

In [None]:
len(df[df.edInput ==0])

In [75]:
X = df.iloc[:, 2]
y = df.iloc[:, 1]

In [76]:
X

0                                                                       It will take good, determined individuals to force our capitalist system to recalibrate before an upheaval. And private-sector leaders should be leading the charge. https://t.co/sx9NGrkmho
1                                                                                                                                                                          Test your job candidates. You should be pleased with the results. https://t.co/GQLImkoRxj
2                                                                                                                                  Joe Biden, confident of a strong win in South Carolina’s primary, is already looking toward Super Tuesday https://t.co/li98lefYP7
3                                                                                                                                                                     JUST IN: Joe Biden wins South Carolina primary http

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0, stratify =y)

In [15]:
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()

In [16]:
X_test = cv.transform(X_test).toarray()

In [55]:
X_train.shape

(3593, 5610)

In [18]:
X_test.shape

(1198, 13180)

In [19]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [20]:
y_pred = classifier.predict(X_test)

Baseline Accuracy Score:

In [21]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9666110183639399


In [3]:
regexweb = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
regexemail = '^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}$'
regexnick = '[@]\w'
regexhashtag = '[#]\w'
corpus = []
for i in range(0, len(df)):
    #print(re.findall(regex, df['text'][i] ))
    review = re.sub(regexweb, ' ', df['text'][i])
    review = re.sub(regexnick, ' ', review)
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    spell = SpellChecker()
    review = [spell.correction(word) for word in review]    
    review = ' '.join(review)
    corpus.append(review)

In [102]:
corpus

['take good determine individual for capitalist system recalibr upheld private sector leader lead charge',
 'test job candid pleas result',
 'joe biden confide strong win south carolina primary already look toward super tuesday',
 'joe biden win south carolina primary',
 'joe biden win south carolina primary make first ever victory three president run',
 'abstract modern art piece sell million dollar auction',
 'watch crowd joe biden south carolina watch parti react former vice preside name primary winner nep read latest',
 'inter precious asset guard every got',
 'today great day full comeback start south carolina joe biden argue strong show scprimari foretell win southern state similar number black voter day super tuesday',
 'dele count stand joe biden denis win south carolina',
 'escape nature glass cabin sweden',
 'joe biden south carolina primary',
 'scene joe biden watch parti former vice preside declare winner southcarolinaprimari',
 'follow number real time president candid del

In [103]:
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.25, random_state = 0, stratify =y)

In [104]:
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [105]:
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [106]:
y_pred = classifier.predict(X_test)

In [107]:
score = accuracy_score(y_test, y_pred)
print(score)

0.9649415692821369


In [108]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.5110948872812549

In [42]:
penalty = ['l1', 'l2']
C = np.logspace(-3, 3, 7)
hyperparameters = dict(C=C, penalty=penalty)
classifier2 = LogisticRegression()
clf = GridSearchCV(classifier2, hyperparameters, cv=5, verbose=1)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_p

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             verbose=1)

In [43]:
clf.best_params_

{'C': 0.001, 'penalty': 'l2'}

In [44]:
best = clf.best_estimator_

In [45]:
y_pred = best.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9674457429048414


TF-IDF Vectorizor

In [33]:
cv = TfidfVectorizer()
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [35]:
penalty = ['l1', 'l2']
C = np.logspace(-3, 3, 7)
hyperparameters = dict(C=C, penalty=penalty)
classifier3 = LogisticRegression()
clf = GridSearchCV(classifier3, hyperparameters, cv=5, verbose=1)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\user1\.conda\envs\Machine Learning\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_p

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2']},
             verbose=1)

In [36]:
best = clf.best_estimator_
y_pred = best.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9674457429048414


In [50]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9666110183639399


In [52]:
from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.9649415692821369


Creating a Pipeline

In [109]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.decomposition import TruncatedSVD
ad = ADASYN()
smt = SMOTE()
model = LogisticRegression(C=0.001)
svdT = TruncatedSVD(n_components=4500)
cv = CountVectorizer()
#X_adasyn, y_adasyn = smt.fit_resample(X, y)

In [110]:
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size = 0.25, random_state = 0, stratify =y)

In [111]:
from imblearn.pipeline import make_pipeline
adatrunclog=make_pipeline(cv, ad, svdT, model)
adatrunclog.fit(X_train, y_train)
y_pred = adatrunclog.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.6636060100166945


In [100]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.5792239109754209

In [112]:
accuracy_score(y_test, y_pred)

0.6636060100166945

In [113]:
precision_score(y_test, y_pred)

0.9748743718592965

In [114]:
recall_score(y_test, y_pred)

0.6695427092320967

In [115]:
f1_score(y_test, y_pred)

0.7938618925831202

In [117]:
smotelogisticregression = make_pipeline(cv, smt, svdT, model)
adatrunclog.fit(X_train, y_train)
y_pred = adatrunclog.predict(X_test)
score = accuracy_score(y_test, y_pred)

In [118]:
roc_auc_score(y_test, y_pred)

0.5920444237959337

In [119]:
accuracy_score(y_test, y_pred)

0.666110183639399

In [120]:
precision_score(y_test, y_pred)

0.9761606022584692

In [121]:
recall_score(y_test, y_pred)

0.6712683347713546

In [122]:
f1_score(y_test, y_pred)

0.7955010224948875

In [125]:
model = SGDClassifier(random_state = 0)
smoteSGD= make_pipeline(cv, smt, svdT, model)
smoteSGD.fit(X_train, y_train)
y_pred = smoteSGD.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(score))
print("Presicion: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Acore: {}".format(f1_score(y_test, y_pred)))

Accuracy: 0.9240400667779632
Presicion: 0.9717314487632509
Recall: 0.9490940465918896
F1 Acore: 0.9602793539938892


In [126]:
model = RandomForestClassifier(random_state = 0)
smoteSGD= make_pipeline(cv, smt, svdT, model)
smoteSGD.fit(X_train, y_train)
y_pred = smoteSGD.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(score))
print("Presicion: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Acore: {}".format(f1_score(y_test, y_pred)))

Accuracy: 0.9649415692821369
Presicion: 0.9673640167364017
Recall: 0.997411561691113
F1 Acore: 0.9821580288870009


In [127]:
model = SVC(random_state = 0)
smoteSGD= make_pipeline(cv, smt, svdT, model)
smoteSGD.fit(X_train, y_train)
y_pred = smoteSGD.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(score))
print("Presicion: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Acore: {}".format(f1_score(y_test, y_pred)))

Accuracy: 0.9540901502504173
Presicion: 0.9670050761421319
Recall: 0.9861949956859362
F1 Acore: 0.9765057667663392


In [128]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
smoteSGD= make_pipeline(cv, smt, svdT, model)
smoteSGD.fit(X_train, y_train)
y_pred = smoteSGD.predict(X_test)
score = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(score))
print("Presicion: {}".format(precision_score(y_test, y_pred)))
print("Recall: {}".format(recall_score(y_test, y_pred)))
print("F1 Acore: {}".format(f1_score(y_test, y_pred)))

Accuracy: 0.07679465776293823
Presicion: 0.9344262295081968
Recall: 0.04918032786885246
F1 Acore: 0.09344262295081968


In [129]:
roc_auc_score(y_test, y_pred)

0.473308112652375