In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [2]:
train = pd.read_csv('C:\\Users\\guilh_000\Documents\\tweetsclassify\\train_tweet.csv')
test = pd.read_csv('C:\\Users\\guilh_000\\Documents\\tweetsclassify\\test_tweets.csv')

print(train.shape)
print(test.shape)

(31962, 3)
(17197, 2)


In [3]:
from textblob import TextBlob
def lemmatize_with_postag(sentence):
    sent = TextBlob(sentence)
    tag_dict = {"J":'a',"N":'n',"V":'v',"R":'r'}
    words_and_tags = [(w, tag_dict.get(pos[0],'n'))for w, pos in sent.tags]
    lemmatized_list = [wd.lemmatize(tag)for wd, tag in words_and_tags]
    return " ".join(lemmatized_list)

import string
import nltk
from nltk.corpus import stopwords
import re
def normalizer(tweet):
    punct = list(string.punctuation)
    stop_words = stopwords.words('english')
    additional_stop_words = ['RT','rt','via','...','http','twitpic','tinyurl','www','amp']
    stopword_list = punct + stop_words + additional_stop_words
    
    tweet = re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet)
    tweet_ = re.sub("(\w+:\/\/\S+)", " ", tweet)
    tweet__ = re.sub("[^a-zA-Z]", " ", tweet_)
    lemmatized = lemmatize_with_postag(tweet__)
    tokens = nltk.word_tokenize(lemmatized)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stopword_list, lower_case))
    return filtered_result

In [4]:
train['normalized'] = train.tweet.apply(normalizer)
train['normal'] = [' '.join(map(str, l)) for l in train['normalized']]

In [5]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,2))
vectorized_data =count_vectorizer.fit_transform(train['normal'])
indexed_data =hstack((np.array(range(0,vectorized_data.shape[0]))[:,None],vectorized_data))

In [6]:
targets = train.iloc[:, 1]

from sklearn.model_selection import train_test_split 
data_train, data_test, targets_train, targets_test =train_test_split(indexed_data, targets, test_size=0.25,random_state=0)
data_train_index = data_train[:,0]
data_train = data_train[:,1:]
data_test_index = data_test[:,0]
data_test = data_test[:,1:]

In [7]:
mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
mlp.fit(data_train,targets_train)

MLPClassifier(hidden_layer_sizes=(8, 8, 8), max_iter=500)

In [8]:
y_pred = mlp.predict(data_test)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(targets_test,y_pred))
print(classification_report(targets_test,y_pred))
print(accuracy_score(targets_test, y_pred))

[[7277  183]
 [ 295  236]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      7460
           1       0.56      0.44      0.50       531

    accuracy                           0.94      7991
   macro avg       0.76      0.71      0.73      7991
weighted avg       0.93      0.94      0.94      7991

0.9401827055437367


In [7]:
from imblearn.over_sampling import SMOTE
#import numpy as np
sm = SMOTE(random_state=12)
data_train_res, targets_train_res = sm.fit_sample(data_train, targets_train)
print (targets_train.value_counts() , np.bincount(targets_train_res))

0    22260
1     1711
Name: label, dtype: int64 [22260 22260]


In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=500)
mlp.fit(data_train,targets_train)

In [15]:
targets_pred = mlp.predict(data_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(targets_test,targets_pred))
print(classification_report(targets_test,targets_pred))
print(accuracy_score(targets_test, targets_pred))

[[7460    0]
 [ 531    0]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      7460
           1       0.00      0.00      0.00       531

    accuracy                           0.93      7991
   macro avg       0.47      0.50      0.48      7991
weighted avg       0.87      0.93      0.90      7991

0.9335502440245276


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
#########################################################################################################################
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=0)
gb.fit(data_train, targets_train)
print("Accuracy on training set: {:.3f}".format(gb.score(data_train, targets_train)))
print("Accuracy on test set: {:.3f}".format(gb.score(data_test, targets_test)))

Accuracy on training set: 0.941
Accuracy on test set: 0.943


In [12]:
gb_pred = gb.predict(data_test)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(targets_test,gb_pred))
print(classification_report(targets_test,gb_pred))
print(accuracy_score(targets_test, gb_pred))

[[7444   16]
 [ 438   93]]
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      7460
           1       0.85      0.18      0.29       531

    accuracy                           0.94      7991
   macro avg       0.90      0.59      0.63      7991
weighted avg       0.94      0.94      0.93      7991

0.943186084344888
