In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
import src.vectorizer as v
import src.preprocessing as p
import re
from textstat.textstat import textstat
from textblob import TextBlob
import seaborn as sns

%matplotlib inline

In [2]:
text_only_df = pd.read_csv('data/labels_and_text_only.csv')

In [4]:
def remove_handles(content):
    return ' '.join(re.sub("(@[A-Za-z0-9]+)"," ",content).split())

In [5]:
text_only_df['tweet_no_handle'] = text_only_df['tweet_text'].apply(remove_handles)

## Adding Reading Scores

In [6]:
text_only_df['reading_ease'] = text_only_df['tweet_no_handle'].apply(textstat.flesch_reading_ease)
text_only_df['reading_grade'] = text_only_df['tweet_no_handle'].apply(textstat.flesch_kincaid_grade)

## Adding Sentiment Analysis

In [9]:
text_only_df['sentiment'] = text_only_df['tweet_text'].map(lambda x: TextBlob(x.decode('latin-1')).polarity)
text_only_df['subjectivity'] = text_only_df['tweet_text'].map(lambda x: TextBlob(x.decode('latin-1')).subjectivity)

In [11]:
text_only_df.columns

Index([u'Unnamed: 0', u'labels', u'tweet_text', u'tweet_no_handle',
       u'reading_ease', u'reading_grade', u'sentiment', u'subjectivity'],
      dtype='object')

In [13]:
text_only_df.drop(['Unnamed: 0'],axis=1, inplace=True)

In [14]:
text_only_df.columns

Index([u'labels', u'tweet_text', u'tweet_no_handle', u'reading_ease',
       u'reading_grade', u'sentiment', u'subjectivity'],
      dtype='object')

In [22]:
X = text_only_df.drop(['tweet_text', 'labels'], axis=1)
y = text_only_df['labels']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [24]:
vec = v.Vectorizer(tokenizer='porter',
                   encoding='latin-1',
                   min_df=2,
                   ngram_range=(1,3))

In [31]:
train_text = X_train['tweet_no_handle']
test_text = X_test['tweet_no_handle']

In [32]:
train_vectors = vec.fit_transform(train_text)

In [33]:
test_vectors = vec.vectorizer.transform(test_text)

In [43]:
train_vectors.shape

(13058, 27624)

In [35]:
test_vectors.shape

(1451, 27624)

In [39]:
feature_cols = [u'reading_ease', u'reading_grade', u'sentiment', u'subjectivity']

In [40]:
features = X_train[feature_cols].values

In [41]:
features.shape

(13058, 4)

In [46]:
train = np.concatenate((train_vectors.todense(), features), axis=1)

In [52]:
train.shape

(13058, 27628)

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV

In [48]:
param_grid = [
  {'C': [1, 10, 100, 1000]}
]

clf = GridSearchCV(LogisticRegression(class_weight='balanced'), param_grid, cv=5)
clf.fit(train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000]}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [49]:
test_features = X_test[feature_cols].values

In [50]:
test = np.concatenate((test_vectors.todense(), test_features), axis=1)

In [51]:
preds = clf.predict(test)
print(classification_report(y_test, preds))

               precision    recall  f1-score   support

         hate       0.57      0.53      0.55       256
not offensive       0.83      0.96      0.89       716
    offensive       0.75      0.61      0.67       479

  avg / total       0.76      0.77      0.76      1451



In [None]:
text_only_df.sentiment.hist()

In [None]:
sns.stripplot(x='labels', data=text_only_df, y='sentiment', jitter=True)

In [None]:
sns.barplot(data=text_only_df, x='labels', y='sentiment')

In [None]:
text_only_df[(text_only_df['labels']=='hate') & (text_only_df.sentiment > 0.8)].tweet_text.tolist()

In [None]:
text_only_df.subjectivity.hist()

In [None]:
sns.stripplot(x='labels', data=text_only_df, y='subjectivity', jitter=True, size=1)

In [None]:
sns.barplot(data=text_only_df, x='labels', y='subjectivity')

In [None]:
text_only_df.columns

In [None]:
text_only_df.reading_ease.hist(bins=100)

In [None]:
text_only_df.reading_grade.hist(bins=100)

In [None]:
sns.stripplot(data=text_only_df, x='labels', y='reading_ease', jitter=True)

In [None]:
sns.stripplot(data=text_only_df, x='labels', y='reading_grade', jitter=True)

In [None]:
sns.pointplot(x='labels', y='reading_ease', data=text_only_df)

In [None]:
sns.pointplot(x='labels', y='reading_grade', data=text_only_df)

In [None]:
X_train.shape

In [None]:
text_only_df.columns

In [None]:
text_only_df.sentiment.shape

In [None]:
X = text_only_df.tweet_no_handle.values