In [None]:
import nltk

In [None]:
# Set random seed
seed = 123
# Data manipulation/analysis
import numpy as np
import pandas as pd
# Text preprocessing/analysis
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
# Modelling
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid", context='talk')


In [None]:
# Load the first dataset; the path is relative to the notebook's working directory
sample = pd.read_csv('depressedData_Edited.csv')
# Preview the first rows (the last expression of a cell is rendered as a table)
sample.head()

In [None]:
# Class balance of the raw `sentiment` labels in the first dataset.
# NOTE(review): a later cell compares these labels to the string 'depressed' -
# confirm whether this column holds text labels or 1/0 codes.
sample['sentiment'].value_counts()

In [None]:
# Path of the second dataset, relative to the notebook's working directory
file = 'sentiment140_processedv3.csv' 

import chardet
# Guess the encoding from the first 100,000 bytes only; that is enough for a
# sniff and avoids reading the whole file twice.
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
# Only the LAST expression of a cell is displayed, so a bare `result` here
# would be silently discarded - print it explicitly.
print(result)

# The encoding is pinned rather than taken from `result`: the guess above is
# based on a prefix of the file and is shown for information only.
df = pd.read_csv(file, encoding='ISO-8859-1')

df.head()

In [None]:
# Class balance of the raw `sentiment` labels in the second dataset
df['sentiment'].value_counts()

In [None]:
# Keep only the tweet text and its label. `.copy()` makes this an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
sample1 = sample[['new_tweets', 'sentiment']].copy()

# Encode to numeric: 1 where the label is 'depressed', 0 for any other label.
# The condition reads from `sample1` itself so the cell has a single input frame.
sample1['target'] = np.where(sample1['sentiment'] == 'depressed', 1, 0)
# Check values: each sentiment label should map to exactly one target code
sample1.groupby(['sentiment', 'target']).count().unstack()

In [None]:
# Keep only the tweet text and its label. `.copy()` makes this an independent
# frame, so adding a column below does not raise SettingWithCopyWarning.
df1 = df[['new_tweets', 'sentiment']].copy()

# Encode to numeric: 1 where the label is 'negative', 0 for any other label.
# The condition reads from `df1` itself so the cell has a single input frame.
df1['target'] = np.where(df1['sentiment'] == 'negative', 1, 0)
# Check values: each sentiment label should map to exactly one target code
df1.groupby(['sentiment', 'target']).count().unstack()

In [None]:
# Hold out 6,200 rows of the first dataset for testing.
# Both the text and the label are taken from `sample1`: the `target` column is
# created on `sample1` only, so `sample['target']` would raise a KeyError.
X_train, X_test, y_train, y_test = train_test_split(
    sample1['new_tweets'], sample1['target'], test_size=6200, random_state=seed)

# Recombine text and label so each partition can be inspected as one frame
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

# Report partition sizes and class balance
print(f"Train: {train.shape[0]} rows and {train.shape[1]} columns")
print(f"{train['target'].value_counts()}\n")
print(f"Test: {test.shape[0]} rows and {test.shape[1]} columns")
print(test['target'].value_counts())

In [None]:
# Hold out 112,000 rows of the second dataset for testing; `random_state=seed`
# makes the shuffle reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(df1['new_tweets'], df1['target'], test_size=112000, random_state=seed)

# Recombine text and label so each partition can be inspected as one frame
train1 = pd.concat([X_train1, y_train1], axis=1)
test1 = pd.concat([X_test1, y_test1], axis=1)

# Report partition sizes and class balance
print(f"Train1: {train1.shape[0]} rows and {train1.shape[1]} columns")
print(f"{train1['target'].value_counts()}\n")
print(f"Test1: {test1.shape[0]} rows and {test1.shape[1]} columns")
print(test1['target'].value_counts())

In [None]:
# Preview the first 15 rows of the first dataset's training partition
train.head(15)

In [None]:
# Preview the first 15 rows of the second dataset's training partition
train1.head(15)

In [None]:
# Preview the first 15 rows of the first dataset's test partition
test.head(15)

In [None]:
# Preview the first 15 rows of the second dataset's test partition
test1.head(15)

In [None]:
# VADER sentiment analyser; reused below to score every training tweet.
# NOTE(review): NLTK needs its 'vader_lexicon' resource to be available -
# confirm it has been downloaded in this environment.
sid = SentimentIntensityAnalyzer()

In [None]:
# Score every training tweet with VADER. `polarity_scores` returns a dict with
# the keys 'neg', 'neu', 'pos' and 'compound'; the columns keep those names
# because the following cells read train['pos'], train['neg'], etc.
# Building one DataFrame from the list of dicts is far cheaper than
# `.apply(pd.Series)`, which constructs a Series object per row.
vader_columns = ['neg', 'neu', 'pos', 'compound']
vader_scores = pd.DataFrame(
    train['new_tweets'].apply(sid.polarity_scores).tolist(),
    index=train.index,
)
# Select by name so the assignment does not depend on the dict's key order
train[vader_columns] = vader_scores[vader_columns]
train.head()

In [None]:
# One figure per VADER score, overlaying the distribution for each target class.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; it is
# kept here so the output stays identical.
for var in ['pos', 'neg', 'neu', 'compound']:
    fig, ax = plt.subplots(figsize=(9,4))
    # (row filter, bar colour, legend label) for each overlaid histogram
    for condition, colour, legend_label in (("target==1", 'green', 'Positive'),
                                            ("target==0", 'red', 'Negative')):
        sns.distplot(train.query(condition)[var], bins=30, kde=False,
                     color=colour, label=legend_label, ax=ax)
    ax.legend()
    ax.set_title(f'Histogram of {var} by true sentiment');

In [None]:
# Rule-based prediction: 0 when the positive score exceeds the negative score,
# 1 otherwise (ties included).
pos_dominates = train['pos'] > train['neg']
train['vader_polarity'] = (~pos_dominates).astype(int)
# Display names for classes 0 and 1, reused by later reports
target_names=['negative', 'positive']
report = classification_report(train['target'],
                               train['vader_polarity'],
                               target_names=target_names)
print(report)

In [None]:
# Reusable helper for drawing confusion matrices
def plot_cm(y_test, y_pred, target_names=['negative', 'positive'], 
            figsize=(5,3)):
    """Draw a labelled confusion-matrix heatmap on a new figure.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    target_names : list of str
        Tick labels applied to both axes, in class order.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    matrix = confusion_matrix(y_test, y_pred)
    _, axis = plt.subplots(figsize=figsize)
    sns.heatmap(matrix, ax=axis, annot=True, fmt='g', cmap='BuGn',
                cbar=False)
    axis.set(title='Confusion matrix', xlabel='Predicted', ylabel='Actual')
    axis.set_xticklabels(target_names)
    # Centre the row labels vertically against their cells
    axis.set_yticklabels(target_names,
                         fontdict={'verticalalignment': 'center'});
# Confusion matrix for the pos-vs-neg VADER rule on the training data
plot_cm(train['target'], train['vader_polarity'])

In [None]:
# Rule-based prediction from VADER's compound score, using the same encoding as
# the `vader_polarity` rule (positive sentiment -> 0, otherwise -> 1).
# The original mapped compound > 0 to 1, i.e. the opposite way round, which
# inverted every prediction relative to `target`.
train['vader_compound'] = np.where(train['compound']>0, 0, 1)
print(classification_report(train['target'], 
                            train['vader_compound'], 
                            target_names=target_names))

In [None]:
# Helpers for building and scoring baseline classifiers
def create_baseline_models():
    """Return the baseline classifiers as a list of ``(name, estimator)`` pairs.

    The estimators that accept ``random_state`` are seeded with the notebook's
    ``seed`` value.
    """
    return [
        ('log', LogisticRegression(random_state=seed, max_iter=1000)),
        ('sgd', SGDClassifier(random_state=seed)),
        ('mnb', MultinomialNB()),
    ]
def assess(X, y, models, cv=5, scoring=['roc_auc', 
                                        'accuracy', 
                                        'f1']):
    """Cross-validate each model and tabulate mean/std of every reported value.

    Parameters
    ----------
    X, y
        Features and labels passed straight to ``cross_validate``.
    models : iterable of (str, estimator)
        Name/estimator pairs; each name becomes a column of the result.
    cv : int
        Cross-validation setting forwarded to ``cross_validate``.
    scoring : list of str
        Scorer names forwarded to ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One column per model; rows are ``<key>_mean`` and ``<key>_std`` for
        every key returned by ``cross_validate``, sorted by row label.
    """
    summary = pd.DataFrame()
    for label, estimator in models:
        fold_scores = pd.DataFrame(
            cross_validate(estimator, X, y, cv=cv, scoring=scoring)
        )
        # Suffix the row labels so the means and stds can share one column
        stats = [
            fold_scores.mean().rename('{}_mean'.format),
            fold_scores.std().rename('{}_std'.format),
        ]
        summary[label] = pd.concat(stats, axis=0)
    return summary.sort_index()

In [None]:
# Instantiate the baseline classifiers and display the (name, estimator) pairs
models = create_baseline_models()
models

In [None]:
# Preprocess the data: TF-IDF over lowercase alphabetic tokens with English
# stop words removed. min_df=30 / max_df=.7 are the document-frequency cut-offs
# passed to the vectoriser.
# NOTE(review): a later cell casts X_train with `.values.astype('U')` before
# vectorising - confirm whether `new_tweets` can contain missing values, which
# this call would not accept.
vectoriser = TfidfVectorizer(token_pattern=r'[a-z]+', 
                             stop_words='english', 
                             min_df=30, 
                             max_df=.7)
X_train_simpler = vectoriser.fit_transform(X_train)
# Assess the baseline models on these features with cross-validation
assess(X_train_simpler, y_train, models)

In [None]:

# Built once and reused: `preprocess_text` is called once per document by the
# vectoriser, so re-creating these objects on every call is wasted work.
_TOKENISER = RegexpTokenizer(r'[A-Za-z]+')
_LEMMATISER = WordNetLemmatizer()

# Define function
def preprocess_text(text):
    """Convert raw text into a list of lowercase, verb-lemmatised tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Alphabetic tokens, lowercased and lemmatised with ``pos='v'``.
    """
    # 1. Tokenise to alphabetic tokens, 2. lowercase and lemmatise as verbs
    return [_LEMMATISER.lemmatize(token.lower(), pos='v')
            for token in _TOKENISER.tokenize(text)]
# Preprocess the data: TF-IDF using `preprocess_text` as the analyser, so the
# vectoriser's own tokenisation settings are replaced by that function.
# Note that this rebinds `vectoriser`, replacing the earlier instance.
vectoriser = TfidfVectorizer(analyzer=preprocess_text, 
                             min_df=30, 
                             max_df=.7)
X_train_simple = vectoriser.fit_transform(X_train)
# Assess the baseline models on these features with cross-validation
assess(X_train_simple, y_train, models)

In [None]:
# Pairwise correlation between the target and each VADER score
columns = ['target', 'neg', 'neu', 'pos', 'compound']
correlations = train[columns].corr()
fig, ax = plt.subplots(figsize = (14,5))
sns.heatmap(correlations, annot=True, cmap='seismic_r', ax=ax);

In [None]:
# Features for this experiment: the four VADER scores only
scores = train[['neg', 'neu', 'pos', 'compound']]
# Classifier and feature scaler
sgd = SGDClassifier(random_state=seed)
scaler = MinMaxScaler()
# Rescale the scores with MinMaxScaler, then cross-validate the classifier
scores_scaled = scaler.fit_transform(scores)
assess(scores_scaled, y_train, [('sgd', sgd)])

In [None]:
# Fit on the full training scores.
# NOTE(review): this fits on the unscaled `scores`, whereas the cross-validation
# above used MinMax-scaled values - confirm that is intended.
sgd.fit(scores, y_train)
# One row per feature, with a single 'coef' column
coefs = (
    pd.DataFrame(data=sgd.coef_, columns=scores.columns)
    .T
    .rename(columns={0: 'coef'})
)
# Bar chart of the fitted coefficients
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(x=coefs.index, y='coef', data=coefs, ax=ax)
ax.set_title('Coefficients');

In [None]:


# Final pipeline: TF-IDF over unigrams and bigrams of lowercase alphabetic
# tokens, followed by an SGD classifier with loss='log'.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' and
# reject 'log' - confirm against the installed version.
# NOTE(review): the vectoriser's `encoding` presumably has no effect here, since
# the input is already cast to unicode strings below - verify.
pipe = Pipeline([('vectoriser', TfidfVectorizer(encoding='ISO-8859-1',token_pattern=r'[a-z]+', min_df=30, max_df=.6, ngram_range=(1,2))),
                 ('model', SGDClassifier(random_state=seed, loss='log'))])
# Cast to a unicode array before fitting; every value becomes a str
pipe.fit(X_train.values.astype('U'),y_train)



In [None]:
# Vocabulary of the fitted vectoriser. `get_feature_names()` no longer exists in
# newer scikit-learn releases, which provide `get_feature_names_out()` instead;
# use whichever the installed version offers.
vectoriser_step = pipe['vectoriser']
if hasattr(vectoriser_step, 'get_feature_names_out'):
    feature_names = vectoriser_step.get_feature_names_out()
else:
    feature_names = vectoriser_step.get_feature_names()
# One row per term with its fitted coefficient, sorted ascending
coefs = pd.DataFrame(pipe['model'].coef_, 
                     columns=feature_names)
coefs = coefs.T.rename(columns={0:'coef'}).sort_values('coef')
coefs

In [None]:
# Predict on the training partition, casting to unicode exactly as was done
# when the pipeline was fitted.
train_pred = pipe.predict(X_train.values.astype('U'))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the wrong support.
print(classification_report(y_train, 
                            train_pred, 
                            target_names=target_names))

In [None]:
# Predict on the held-out test partition, casting to unicode exactly as was
# done when the pipeline was fitted. Stored as `test_pred` because these are
# test-set predictions.
test_pred = pipe.predict(X_test.values.astype('U'))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the wrong support.
print(classification_report(y_test, 
                            test_pred, 
                            target_names=target_names))

In [None]:
# Apply the pipeline (fitted on the first dataset) to the second dataset's
# training partition.
train_pred = pipe.predict(X_train1.values.astype('U'))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the wrong support.
print(classification_report(y_train1, 
                            train_pred, 
                            target_names=target_names))

In [None]:
# Apply the pipeline (fitted on the first dataset) to the second dataset's
# test partition.
test_pred = pipe.predict(X_test1.values.astype('U'))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports the wrong support.
print(classification_report(y_test1, 
                            test_pred, 
                            target_names=target_names))