**EDA Analysis over NLP Raw Data**

**Package and Data Import**

In [17]:
### Packages to Import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')
import nltk
import ssl

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moore\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
### Import Data and Change Column Names

df = pd.read_csv("data/tweets.csv")
df.columns = ['text', 'device', 'emotion']

**Data Cleaning Prior to Train / Test Split**

In [19]:
### Deliniating between Google and Apple

google_tweets = ['Google', 'Other Google product or service', 'Andriod App', 'Andriod']
apple_tweets = ['Apple', 'Other Apple product or service', 'Apple App', 'iPhone', 'iPad', 'iPad or iPhone App']

### Creating a new column for google vs. apple vs. unknown

df['device_type'] = np.where(df['device'].isin(google_tweets), 'Google', np.where(df['device'].isin(apple_tweets), 'Apple', 'Unknown'))

### Dropping 'I can't tell' and 'Other' rows

df = df[df['emotion'] != "I can't tell"]

### Dropping blank 'text' rows

df = df.dropna(subset=['text'])


**Performing Train / Test Split**

In [20]:
### Performing a train/test split

X = df.drop('emotion', axis=1)
y = df['emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

**Creating Functions that Clean and Tokenize the Text**

In [21]:
### Creating a function that removes words that begin with @

def remove_at(text):
    text = text.split()
    text = [word for word in text if not word.startswith('@')]
    text = ' '.join(text)
    return text

### Creating a function that makes all text lowercase

def lower_case(text):
    text = text.lower()
    return text

### Creating a function that removes all punctuation

def remove_punctuation(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

### Creating a function that removes all stopwords

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = text.split()
    text = [word for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

### Creating a function that removes words that contain characters like ‰ÛÏ or ‰ÛÒ or ‰ÛÓ or ‰ÛÒ

def remove_characters(text):
    text = text.split()
    text = [word for word in text if not word.startswith('‰ÛÏ') and not word.startswith('‰ÛÒ') and not word.startswith('‰ÛÓ') and not word.startswith('‰ÛÒ')]
    text = ' '.join(text)
    return text

### Creating a function that lemmatizes words

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

### Creating a function that removes all numbers

def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text

### Creating a function takes the tokenized text and returns a string of words

def tokenize_to_string(text):
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)
    text = ' '.join(text)
    return text

### Creating a function that combines all of the above functions

def clean_text(text):
    text = remove_at(text)
    text = lower_case(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_characters(text)
    text = lemmatize(text)
    text = remove_numbers(text)
    text = tokenize_to_string(text)
    return text

**Applying the clean_text function to the training data**

In [22]:
### Applying the 'lemmatize_text' function to the 'text' column

X_train['text'] = X_train['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['text'] = X_train['text'].apply(clean_text)


In [23]:
### Vectorizing the text column with TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['text'])

In [24]:
### Applying SMOTE to the training data

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=1337)
X_train_vectorized, y_train = sm.fit_resample(X_train_tfidf, y_train)

**Fitting the training data to a random forest model**

In [25]:
### Fitting the training data to a random forest classifier

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1337)
rfc.fit(X_train_vectorized, y_train)

### Applying the best parameters to the random forest classifier

rfc = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=1337)
rfc.fit(X_train_vectorized, y_train)

RandomForestClassifier(max_depth=30, n_estimators=300, random_state=1337)

In [26]:
### Cross validating the model

from sklearn.model_selection import cross_val_score
cross_val_score(rfc, X_train_vectorized, y_train, cv=5)

array([0.72751938, 0.77635659, 0.80379992, 0.84373788, 0.85847228])

**Testing the model on the test data**

In [27]:
### Applying preprocessing to the test data

X_test['text'] = X_test['text'].apply(clean_text)
X_test['text'] = X_test['text'].apply(lambda x: ' '.join(x))

### Transforming the test data

X_test_vectorized = tfidf.transform(X_test['text'])

### Predicting the test data

y_pred = rfc.predict(X_test_vectorized)

### Evaluating the accuracy, recall, and precision of the model

from sklearn.metrics import accuracy_score, recall_score, precision_score
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
print('Precision: ', precision_score(y_test, y_pred, average='weighted'))

Accuracy:  0.6090604026845637
Recall:  0.6090604026845637
Precision:  0.370954574118283


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['text'] = X_test['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['text'] = X_test['text'].apply(lambda x: ' '.join(x))
  _warn_prf(average, modifier, msg_start, len(result))
