In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import src.vectorizer as vectorizer
import src.preprocessing as preprocessing
import re
from textstat.textstat import textstat
from textblob import TextBlob
import seaborn as sns
import string
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
%matplotlib inline

In [2]:
# Load the tweet/label table; the first CSV column becomes the DataFrame index.
text_only_df = pd.read_csv(
    'data/labels_and_text_only.csv',
    index_col=0
)


In [3]:
# Twitter handles may contain ASCII letters, digits and underscores. Leaving the
# underscore out of the character class makes "@foo_bar" match only "@foo", so
# the remainder ("_bar") would survive in the cleaned text.
_HANDLE_RE = re.compile(r"@[A-Za-z0-9_]+")

def remove_handles(content):
    """Return ``content`` with every @-mention removed.

    Each mention is replaced by a space and the result is re-joined on single
    spaces, so runs of whitespace (including leading/trailing) are collapsed.
    """
    return ' '.join(_HANDLE_RE.sub(' ', content).split())

def count_handles(content):
    """Return the number of @-mentions found in ``content``."""
    return len(_HANDLE_RE.findall(content))

def bool_handles(content):
    """Return 1 if ``content`` contains at least one @-mention, else 0."""
    return int(_HANDLE_RE.search(content) is not None)

def count_hashtags(content):
    """Count the hashtags ('#' followed by ASCII letters/digits) in ``content``."""
    total = 0
    for _ in re.finditer("(#[A-Za-z0-9]+)", content):
        total += 1
    return total

def bool_hashtags(content):
    """Return 1 when ``content`` holds at least one hashtag, otherwise 0."""
    found = re.search("(#[A-Za-z0-9]+)", content) is not None
    return int(found)

def is_retweet(content):
    """Return 1 if ``content`` contains the retweet marker ``RT `` as its own token, else 0.

    The marker must start at a word boundary; a plain substring test would also
    fire on ordinary words that merely end in "RT" (e.g. "START now").
    """
    return int(re.search(r"\bRT ", content) is not None)

def has_url(content):
    """Return 1 if ``content`` contains an ``https://`` or ``http://`` prefix, else 0."""
    for scheme in ("https://", "http://"):
        if scheme in content:
            return 1
    return 0

def build_POS_list(content):
    """Return the part-of-speech tags of ``content`` as one space-separated string.

    ``content`` is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    only the tag of each (token, tag) pair is kept.

    Byte strings are decoded as latin-1 first. Text that is already decoded is
    used as-is, since calling ``.decode`` on it would fail.
    """
    if isinstance(content, bytes):
        content = content.decode('latin-1')
    return ' '.join(tag for _, tag in pos_tag(word_tokenize(content)))



In [4]:
# Cleaned copy of each tweet with @-mentions stripped (see remove_handles).
text_only_df['tweet_no_handle'] = text_only_df['tweet_text'].map(remove_handles)

## Adding Reading Scores

In [5]:
# Readability scores from textstat, computed on the handle-free text.
for _column, _scorer in (
        ('reading_ease', textstat.flesch_reading_ease),
        ('reading_grade', textstat.flesch_kincaid_grade)):
    text_only_df[_column] = text_only_df['tweet_no_handle'].apply(_scorer)

## Adding Sentiment Analysis

In [6]:
def _tweet_sentiment(text):
    """Return TextBlob's (polarity, subjectivity) sentiment for one tweet."""
    # Only byte strings need decoding; already-decoded text is passed through.
    if isinstance(text, bytes):
        text = text.decode('latin-1')
    return TextBlob(text).sentiment

# Analyse every tweet once and read both scores from the same result instead of
# building (and analysing) a separate TextBlob per output column.
_sentiments = text_only_df['tweet_no_handle'].map(_tweet_sentiment)
text_only_df['sentiment'] = _sentiments.map(lambda s: s.polarity)
text_only_df['subjectivity'] = _sentiments.map(lambda s: s.subjectivity)

## Adding Mention, Hashtag, URL and Length Features

In [None]:
# Counts and flags computed from the raw tweet text (handles still present).
text_only_df['mentions_count'] = text_only_df['tweet_text'].apply(count_handles)
text_only_df['mentions_bool'] = text_only_df['tweet_text'].apply(bool_handles)
text_only_df['hashtag_count'] = text_only_df['tweet_text'].apply(count_hashtags)
text_only_df['hashtag_bool'] = text_only_df['tweet_text'].apply(bool_hashtags)
# The URL flag must come from has_url; filling it with is_retweet would store
# the retweet flag under the wrong column name.
text_only_df['has_url'] = text_only_df['tweet_text'].apply(has_url)
# Retweet flag under its own name. It is not part of feature_cols, so it only
# reaches the models if it is added there.
text_only_df['is_retweet'] = text_only_df['tweet_text'].apply(is_retweet)

# Size measures computed from the handle-free text.
text_only_df['tweet_length'] = text_only_df['tweet_no_handle'].apply(len)
text_only_df['word_count'] = text_only_df['tweet_no_handle'].apply(textstat.lexicon_count)
text_only_df['syllable_count'] = text_only_df['tweet_no_handle'].apply(textstat.syllable_count)

In [None]:
# Space-separated part-of-speech tag sequence for every handle-free tweet.
text_only_df['pos_tags'] = text_only_df['tweet_no_handle'].map(build_POS_list)


In [None]:
# Target vector: the label column.
y = text_only_df['labels']
# Predictors: every column except the raw tweet text and the label itself.
X = text_only_df.drop(
    ['tweet_text', 'labels'],
    axis=1
)

In [None]:
# Hold out 10% of the rows for testing, stratified on the label. The fixed
# random_state makes the split (and everything fitted on it) reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)

In [None]:
# Project vectorizer for the tweet text: 1- to 3-grams, terms seen in >= 5 documents.
vec = vectorizer.Vectorizer(
    tokenizer='porter',
    encoding='latin-1',
    min_df=5,
    ngram_range=(1, 3))

# Count vectorizer over the POS-tag strings, same n-gram range and min_df.
pos_vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 3))

# NOTE(review): this fits pos_vectorizer on the full table (train and test rows)
# and stores one dense Python list per row; no later cell in this notebook
# appears to read 'pos_tag_ngrams' — confirm it is still needed.
text_only_df['pos_tag_ngrams'] = (
    pos_vectorizer
    .fit_transform(text_only_df['pos_tags'])
    .todense()
    .tolist()
)

In [None]:
# Display the column index to check which feature columns now exist on the table.
text_only_df.columns

In [None]:
def create_features(df, feature_cols, vec, pos_vectorizer):
    """Assemble the dense model matrix for the rows of ``df``.

    Columns are laid out, left to right, as:
      1. n-gram counts of ``df['tweet_no_handle']`` from ``vec.vectorizer``,
      2. the raw values of ``df[feature_cols]``,
      3. n-gram counts of ``df['pos_tags']`` from ``pos_vectorizer``.

    Both vectorizers are only used to transform here, so they must already be
    fitted. The sparse outputs are densified before concatenation.
    """
    word_ngrams = vec.vectorizer.transform(df['tweet_no_handle'].values).todense()
    handcrafted = df[feature_cols].values
    pos_ngrams = pos_vectorizer.transform(df['pos_tags'].values).todense()

    blocks = [word_ngrams, handcrafted, pos_ngrams]
    return np.concatenate(blocks, axis=1)

In [None]:
# Numeric columns fed to create_features alongside the n-gram blocks.
feature_cols = [
    # readability
    u'reading_ease', u'reading_grade',
    # TextBlob scores
    u'sentiment', u'subjectivity',
    # mention / hashtag / URL indicators
    u'mentions_count', u'mentions_bool',
    u'hashtag_count', u'hashtag_bool',
    u'has_url',
    # size measures
    u'tweet_length', u'word_count', u'syllable_count',
]

In [None]:
# Fit both vectorizers on the training split only.
train_text = X_train['tweet_no_handle']
vec.fit(train_text);

train_pos_tags = X_train['pos_tags'].tolist()
pos_vectorizer.fit(train_pos_tags)

In [None]:
# Dense feature matrix for the training rows.
train_features = create_features(
    df=X_train,
    feature_cols=feature_cols,
    vec=vec,
    pos_vectorizer=pos_vectorizer)


In [None]:
# Dense feature matrix for the held-out rows, built with the same fitted vectorizers.
test_features = create_features(
    df=X_test,
    feature_cols=feature_cols,
    vec=vec,
    pos_vectorizer=pos_vectorizer)

In [None]:
# Display (rows, columns) of the held-out feature matrix.
test_features.shape

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Candidate values for LogisticRegression's C parameter.
param_grid = [{'C': [1, 10, 100, 1000]}]

# Base estimator; class_weight='balanced' reweights classes by their frequency.
log_r = LogisticRegression(class_weight='balanced')

In [None]:
def test_model(base_model, param_grid):
    grid_clf = GridSearchCV(base_model, param_grid, cv=5)
    grid_clf.fit(train_features, y_train)
    preds = grid_clf.predict(test_features)
    print(classification_report(y_test, preds))
    return grid_clf

In [None]:
def top_words(clf, label, top):
    """Print the names of the ``top`` highest-weighted features for class row ``label``.

    ``clf`` is a fitted search object whose ``best_estimator_`` exposes ``coef_``;
    row ``label`` of that matrix is ranked from largest to smallest coefficient.

    Feature names are assembled in the same column order that ``create_features``
    uses (word n-grams, then ``feature_cols``, then POS n-grams), so an index
    that falls outside the word n-gram block still maps to the right name
    instead of raising ``IndexError``. POS n-grams are prefixed with ``POS:`` to
    tell them apart from word n-grams.
    """
    # Build the name list once; it does not change between iterations.
    names = list(vec.vectorizer.get_feature_names())
    names.extend(feature_cols)
    names.extend('POS:' + tags for tags in pos_vectorizer.get_feature_names())

    weights = clf.best_estimator_.coef_[label, :]
    for i in weights.argsort()[::-1][:top]:
        print("{}".format(names[i]))

In [None]:
# Grid-search the balanced logistic regression over the C grid and print its report.
test_model(base_model=log_r, param_grid=param_grid)

In [None]:
# Forest sizes to search over. This rebinds param_grid, which previously held
# the logistic-regression C grid.
param_grid = {'n_estimators': [10, 25, 50, 100]}
# Fixed seed so the fitted forests (and the reported scores) are reproducible.
forest = RandomForestClassifier(random_state=42)

In [None]:
# Grid-search the random forest; keep the fitted search object for later inspection.
fit_forest = test_model(base_model=forest, param_grid=param_grid)

In [None]:
# Refit a balanced logistic regression (no grid search) directly on X_train.
# NOTE(review): on a top-to-bottom run X_train is still the DataFrame split from
# train_test_split, including its text columns. A later cell rebinds X_train to
# the SelectFromModel output — presumably this cell was meant to run after that
# one. Confirm the intended execution order.
log_r = LogisticRegression(class_weight='balanced')
log_r.fit(X_train, y_train)

In [None]:
# Score the refitted logistic regression on X_test and print the per-class report.
# NOTE(review): X_test must be in the same representation that log_r was fitted
# on; which object X_test refers to depends on whether the feature-selection
# cell has already run — confirm.
preds = log_r.predict(X_test)
print(classification_report(y_test, preds))

# Dimensionality Reduction

In [None]:
# Fit an L1-penalised linear SVM on the training features and use it, already
# fitted (prefit=True), as the selector inside SelectFromModel.
# NOTE(review): the last two lines rebind X_train / X_test — until here the
# DataFrame splits — to the reduced feature matrices, so any cell using those
# names behaves differently depending on execution order.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_features, y_train)
lsvc_select = SelectFromModel(lsvc, prefit=True)
X_train = lsvc_select.transform(train_features)
X_test = lsvc_select.transform(test_features)

In [None]:
# Display (rows, columns) of the held-out matrix after feature selection.
X_test.shape

# Plots

In [None]:
# Distribution of the TextBlob polarity score over all tweets.
text_only_df['sentiment'].hist()

In [None]:
# Per-tweet polarity, one jittered strip per label.
sns.stripplot(data=text_only_df, x='labels', y='sentiment', jitter=True)

In [None]:
# Polarity per label as a bar plot.
sns.barplot(x='labels', y='sentiment', data=text_only_df)

In [None]:
# Tweets labelled 'hate' whose polarity score is nonetheless above 0.8.
text_only_df.loc[
    (text_only_df['labels'] == 'hate') & (text_only_df['sentiment'] > 0.8),
    'tweet_text'
].tolist()

In [None]:
# Distribution of the TextBlob subjectivity score over all tweets.
text_only_df['subjectivity'].hist()

In [None]:
# Per-tweet subjectivity by label; small markers (size=1) to reduce overplotting.
sns.stripplot(data=text_only_df, x='labels', y='subjectivity', size=1, jitter=True)

In [None]:
# Subjectivity per label as a bar plot.
sns.barplot(x='labels', y='subjectivity', data=text_only_df)

In [None]:
# Distribution of the Flesch reading-ease score, 100 bins.
text_only_df['reading_ease'].hist(bins=100)

In [None]:
# Distribution of the Flesch-Kincaid grade score, 100 bins.
text_only_df['reading_grade'].hist(bins=100)

In [None]:
# Per-tweet reading ease, one jittered strip per label.
sns.stripplot(x='labels', y='reading_ease', data=text_only_df, jitter=True)

In [None]:
# Per-tweet reading grade, one jittered strip per label.
sns.stripplot(x='labels', y='reading_grade', data=text_only_df, jitter=True)

In [None]:
# Point estimate of reading ease for each label.
sns.pointplot(data=text_only_df, x='labels', y='reading_ease')

In [None]:
# Point estimate of reading grade for each label.
sns.pointplot(data=text_only_df, x='labels', y='reading_grade')