## Final Project Submission

Please fill out:
* Student name: 
* Student pace: self paced / part time / full time
* Scheduled project review date/time: 
* Instructor name: 
* Blog post URL:


In [378]:
# Import Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, recall_score, precision_score
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud
import warnings
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/albertcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/albertcc/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [344]:
data = pd.read_csv('data/tweets.csv')
data.columns = ['text', 'device', 'emotion']

In [345]:
data.head()

Unnamed: 0,text,device,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [346]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     9092 non-null   object
 1   device   3291 non-null   object
 2   emotion  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [347]:
data['emotion'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: emotion, dtype: int64

### Pre-Data Cleaning Column Addition

In [348]:
### Creating a column that counts the amount of mentions in each tweet

data['mentions'] = data.text.str.count('@')

In [349]:
### Creating a column that counts teh amount of links in a tweet
data['links'] = 0

url_like_strings = ['{link}', '.com', 'http', 'bit.ly', '.co']
for s in url_like_strings:
    data['links'] = data.links + list(map(lambda x: str(x).count(s), data['text']))

### Data Cleaning

In [350]:
### Dropping 'I can't tell' and 'Other' rows

data = data[data['emotion'] != "I can't tell"]

### Dropping blank 'text' rows

data = data.dropna(subset=['text'])

In [351]:
data['emotion'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [352]:
data['device'].value_counts(normalize=True)

iPad                               0.287020
Apple                              0.200792
iPad or iPhone App                 0.143205
Google                             0.130713
iPhone                             0.090189
Other Google product or service    0.088970
Android App                        0.024680
Android                            0.023766
Other Apple product or service     0.010664
Name: device, dtype: float64

### Clean text data using functions

In [353]:
### Creating a function that makes all text lowercase for further analysis

def lower_case(text):
    text = text.lower()
    return text

### Creating a function that removes the use of via in the context of via hashtag or via mention (removes 80% of vias)

def remove_via(text):
    
    if 'via @' in text or 'via #' in text:
        text = text.replace('via', '')
    return text
    
### Creating a function that removes errant html syntax from the tweet (e.g. &amp; and &quot;)

def remove_html(text):
    
    text = text.replace('&amp;', '')
    text = text.replace('&quot;', '')
    return text

### Creating a function that removes urls or instances of '{link}' from the tweet

def remove_url(text):
    
    url_like_strings = ['{link}', '.com', 'http', 'bit.ly', '.co']
    text = text.split()
    for s in url_like_strings:
        text = [word for word in text if s not in word]
    text = ' '.join(text)
    return text

### Creating a function that removes words that contain a @ and rt (retweet) 
### as mentions would not be important in determining the emotion of a tweet

def remove_at_and_rt(text):
    text = text.split()
    text = [word for word in text if '@' not in word]
    text = [word for word in text if word != 'rt']
    text = ' '.join(text)
    return text

### Creating a function that removes '#SXSW' of any case type from the text

def remove_sxsw(text):
    text = text.split()
    text = [word for word in text if '#sxsw' not in word]
    text = ' '.join(text)
    return text

### Creating a function that uses a regex tokenizer to remove punctuation but ignores contraction apostrophes

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')
    text = tokenizer.tokenize(text)
    text = ' '.join(text)
    return text

### Creating a function that removes stopwords from a specified list of stopwords

custom_stop_words = ['in','of','at','a','the']

def remove_stopwords(text, stop_words_list = set(stopwords.words('english'))):
    text = text.split()
    text = [word for word in text if word not in stop_words_list]
    text = ' '.join(text)
    return text

### Creating a function that removes non-ASCII characters

def remove_characters(text):
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text

### Creating a function that lemmatizes words

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

### Creating a function that combines all of the above functions

def clean_text(text):
    text = lower_case(text)
    text = remove_via(text)
    text = remove_html(text)
    text = remove_url(text)
    text = remove_at_and_rt(text)
    text = remove_sxsw(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_characters(text)
    text = lemmatize(text)
    return text

In [354]:
### Testing out the function with some random text
clean_text('Hello this is the test text THAT I have #sb #sxsw eee @!twitter')

'hello test text sb eee'

In [355]:
### Additional test of before and after
print(data.text[60])
print(clean_text(data['text'][60]))

&quot;via @mention : {link} Guy Kawasaki talks 'Enchanted' at SXSW - HE knows his stuff! #books #internet #Apple #sxsw  &quot;
guy kawasaki talk enchanted sxsw know stuff book internet apple


### Feature Engineering

In [356]:
apple = ['iPad', 'Apple', 'iPad or iPhone App', 'iPhone', 'Other Apple product or service']
google = ['Google', 'Other Google product or service', 'Android App', 'Android']

In [357]:
### Creating a new column for google vs. apple vs. unknown

data['device_type'] = np.where(data['device'].isin(google), 'Google', 
                    np.where(data['device'].isin(apple), 'Apple', 
                             'Unknown'))

In [358]:
### Creating a new column for 'Google' and 'Apple' based on device type and key words in the 'text' column

google_key_words = ["Google", "Android", "Pixel", "Circles", "Droid", "Galaxy S", "Realtime", "Maps", "Google Maps", "Circle" ]

apple_key_words = ["Apple", "iPhone", "iPad", "Mac", "iMac", "iPod", "iTunes", "iWatch", "iMessage", "iCloud", "iBook", "iMac", "app_store", "app store", "ios", "ios4", "ios4.1", "ios4.2", "iphone app", "3g", "ios"]
                  
data['Google'] = np.where(data['device_type'] == 'Google', True, 
               np.where(data['text'].str.lower().str.contains('|'.join(google_key_words), case=False), True, 
               False))

data['Apple'] = np.where(data['device_type'] == 'Apple', True,
              np.where(data['text'].str.lower().str.contains('|'.join(apple_key_words), case=False), True,
              False))

### Create new column 'both' that is true if both Google and Apple are true

data['both'] = np.where((data['Google'] == True) & (data['Apple'] == True), True, False)

### Dropping rows where both Google and Apple are true and where Google and Apple are both false

data = data[data['both'] == False]
data = data[data['Google'] != data['Apple']]
data = data.drop(columns=['both'])

In [359]:
data

Unnamed: 0,text,device,emotion,mentions,links,device_type,Google,Apple
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,1.0,0,Apple,False,True
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,2.0,0,Apple,False,True
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1.0,0,Apple,False,True
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,1.0,0,Apple,False,True
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1.0,0,Google,True,False
...,...,...,...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,0.0,1,Apple,False,True
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,1.0,1,Unknown,True,False
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0.0,0,Unknown,True,False
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0.0,0,Unknown,False,True


In [360]:
data[data['device_type']=='Apple']['emotion'].value_counts()

Positive emotion                      1898
Negative emotion                       381
No emotion toward brand or product      65
Name: emotion, dtype: int64

In [361]:
data[data['device_type']=='Google']['emotion'].value_counts()

Positive emotion                      692
Negative emotion                      122
No emotion toward brand or product     26
Name: emotion, dtype: int64

### Vader Score sentiment

In [362]:
### Creating new columns in the dataframe which append 'pos', 'neg', and 'neu' using VADER sentiment analysis

sid = SentimentIntensityAnalyzer()
data['sentiment'] = data['text'].apply(lambda x: sid.polarity_scores(x))
data = pd.concat([data.drop(['sentiment'], axis=1), data['sentiment'].apply(pd.Series)], axis=1)

In [363]:
### Creating an 'emphasis' column that scores how many exclamation points, question marks, and capital letters are in the text

data['punc_emphasis'] = data['text'].apply(lambda x: sum([1 for char in x if char in ['!', '?']]))
data['capt_emphasis'] = data['text'].apply(lambda x: sum([1 for char in x if char.isupper()]))

In [364]:
data

Unnamed: 0,text,device,emotion,mentions,links,device_type,Google,Apple,neg,neu,pos,compound,punc_emphasis,capt_emphasis
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,1.0,0,Apple,False,True,0.203,0.797,0.000,-0.6800,1,15
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,2.0,0,Apple,False,True,0.000,0.576,0.424,0.9100,1,10
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1.0,0,Apple,False,True,0.000,1.000,0.000,0.0000,0,7
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,1.0,0,Apple,False,True,0.000,0.663,0.337,0.7269,0,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1.0,0,Google,True,False,0.000,0.796,0.204,0.6249,0,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,0.0,1,Apple,False,True,0.000,1.000,0.000,0.0000,0,5
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,1.0,1,Unknown,True,False,0.208,0.792,0.000,-0.4939,0,4
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0.0,0,Unknown,True,False,0.000,1.000,0.000,0.0000,0,9
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0.0,0,Unknown,False,True,0.109,0.891,0.000,-0.4019,0,10


In [365]:
data[data['pos'] > 0.1]

Unnamed: 0,text,device,emotion,mentions,links,device_type,Google,Apple,neg,neu,pos,compound,punc_emphasis,capt_emphasis
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,2.0,0,Apple,False,True,0.000,0.576,0.424,0.9100,1,10
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,1.0,0,Apple,False,True,0.000,0.663,0.337,0.7269,0,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1.0,0,Google,True,False,0.000,0.796,0.204,0.6249,0,14
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion,0.0,0,Google,True,False,0.000,0.822,0.178,0.6369,0,8
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion,2.0,2,Apple,False,True,0.000,0.691,0.309,0.7712,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9072,@mention your iPhone 4 cases are Rad and Ready...,iPhone,Positive emotion,1.0,1,Apple,False,True,0.103,0.752,0.145,0.2225,2,5
9073,At #SXSW your iphone charger is your best friend.,,No emotion toward brand or product,0.0,0,Unknown,False,True,0.000,0.486,0.514,0.8126,0,5
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion,1.0,0,Apple,False,True,0.000,0.673,0.327,0.7783,0,4
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion,0.0,0,Apple,False,True,0.000,0.409,0.591,0.8264,2,6


In [366]:
data[data['emotion'] == "Positive emotion"]

Unnamed: 0,text,device,emotion,mentions,links,device_type,Google,Apple,neg,neu,pos,compound,punc_emphasis,capt_emphasis
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,2.0,0,Apple,False,True,0.000,0.576,0.424,0.9100,1,10
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1.0,0,Apple,False,True,0.000,1.000,0.000,0.0000,0,7
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1.0,0,Google,True,False,0.000,0.796,0.204,0.6249,0,14
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion,0.0,0,Google,True,False,0.000,0.822,0.178,0.6369,0,8
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion,2.0,2,Apple,False,True,0.000,0.691,0.309,0.7712,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9072,@mention your iPhone 4 cases are Rad and Ready...,iPhone,Positive emotion,1.0,1,Apple,False,True,0.103,0.752,0.145,0.2225,2,5
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion,1.0,0,Apple,False,True,0.000,0.673,0.327,0.7783,0,4
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion,0.0,0,Apple,False,True,0.000,0.409,0.591,0.8264,2,6
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion,0.0,0,Apple,False,True,0.000,1.000,0.000,0.0000,1,13


### Train Test Split

In [367]:
### Performing a train/test split

X = data.drop('emotion', axis=1)
y = data['emotion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

## Modeling

### First Simple Model - Count Vectorizer / Decision Tree / No Features

In [368]:
### Performing a train test split on the data, only including the 'text' and 'emotion' columns

X1 = data['text']
y1 = data['emotion']

### Adding the tokenizer to the 'text' column in the X features

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X1, y1, test_size=0.2, random_state=1337)

X_train_1 = X_train_1.apply(clean_text)

In [369]:
X_train_1

2687    google map app save life regular basis map 150...
5028    google launch major new social network called ...
3215    nice able use usb charging cord plug add juice...
7888    jealous team android event androidsxsw get swa...
796     google launch major new social network called ...
                              ...                        
3739    tried initiate carpooling ridonkulous taxi lin...
1458                       oh snap 80 party hosted google
992     brown paper window line grows popup apple stor...
218                              hobo shotgun iphone game
3718    google going circle sn we're launching product...
Name: text, Length: 6354, dtype: object

In [370]:
X_train_1[0]

'3g iphone hr tweeting rise_austin dead need upgrade plugin station'

In [371]:
X_test_1

3042    Mayer comes out sans intro, still gets cheers....
6281    RT @mention Love it. at #sxsw: &quot;apple com...
5930    RT @mention Google's Marissa Mayer extolling c...
1759    I'll pay $900 for a new iPad 2, white, 32 GB, ...
5569    RT @mention Best thing I've heard this wknd @m...
                              ...                        
3322    so who's gonna be getting an ipad2 from the ap...
8089    the longest line at #sxsw is at the apple pop ...
913     Hm? Do we need another 1? RT @mention Google t...
4733    Apple plans to Keep Austin Wired, opening a po...
8308    Talk about trying to steal the show... RT @men...
Name: text, Length: 1589, dtype: object

In [372]:
X_test_1[3042]

"Mayer comes out sans intro, still gets cheers. #techrockstar Launches into Google's priority on location - Fast, Fun &amp; Future #sxsw"

In [373]:
### TRAIN - Tokenize the training data with a simple split of words, and then flattening to prepare for vectorization

X_train_1 = X_train_1.apply(lambda x: x.split())
X_train_1 = X_train_1.map(' '.join)

### TRAIN - Vectorize the training data using CountVectorizer

cv = CountVectorizer()
X_train_1 = cv.fit_transform(X_train_1)

### Insert Sampling Technique

sm = SMOTE(random_state=1337)
X_train_1_res, y_train_1_res = sm.fit_resample(X_train_1, y_train_1)

### TRAIN - Fit the training data to a Decision Tree Classifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train_1_res, y_train_1_res)

### VALIDATION - Perform a cross validation on the decision tree classifier

scores = cross_val_score(dtc, X_train_1, y_train_1, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean())

Cross Validation Scores:  [0.65460268 0.64358773 0.63808025 0.63099921 0.64094488]
Mean Cross Validation Score:  0.6416429496273627


In [375]:
### Decision Tree Test Set Preprocessing

X_test_1 = X_test_1.apply(clean_text)
X_test_1 = X_test_1.apply(lambda x: x.split())
X_test_1 = X_test_1.map(' '.join)
X_test_1 = cv.transform(X_test_1)

In [376]:
dtc_pred = dtc.predict(X_test_1)

In [379]:
print('Decision Tree Accuracy: ', accuracy_score(y_test_1, dtc_pred))

Decision Tree Accuracy:  0.5475141598489616


In [374]:
X_train_1_res

<10893x7122 sparse matrix of type '<class 'numpy.int64'>'
	with 111381 stored elements in Compressed Sparse Row format>

In [304]:
X1.shape

(7943,)

In [305]:
y1.shape

(7943,)

In [306]:
X_train_1.shape

(6354, 7122)

In [307]:
y_train_1.shape

(6354,)

In [308]:
X1.head()

0    .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1    @jessedee Know about @fludapp ? Awesome iPad/i...
2    @swonderlin Can not wait for #iPad 2 also. The...
3    @sxsw I hope this year's festival isn't as cra...
4    @sxtxstate great stuff on Fri #SXSW: Marissa M...
Name: text, dtype: object

In [309]:
y1.head()

0    Negative emotion
1    Positive emotion
2    Positive emotion
3    Negative emotion
4    Positive emotion
Name: emotion, dtype: object

**Second Model - Count Vectorizer / Logistic Regression / Added Sentiment and Emphasis Score**

In [266]:
### Performing a train test split on the data, including 'text', VADER scores, and 'emphasis' columns
X2 = data[['text', 'compound', 'neg', 'neu', 'pos', 'punc_emphasis', 'capt_emphasis']]
y2 = data['emotion']

### Performing a train test split on the X2 and Y2 data

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2, y2, test_size=0.2, random_state=1337)


### Applying the tokenizer to the 'text' column in the X features

X_train_2['text'] = X_train_2['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_2['text'] = X_train_2['text'].apply(clean_text)


In [267]:
X_train_2

Unnamed: 0,text,compound,neg,neu,pos,punc_emphasis,capt_emphasis
2687,google map app save life regular basis map 150...,0.0000,0.000,1.000,0.000,0,4
5028,google launch major new social network called ...,0.0000,0.000,1.000,0.000,0,12
3215,nice able use usb charging cord plug add juice...,0.4215,0.000,0.882,0.118,0,11
7888,jealous team android event androidsxsw get swa...,-0.6663,0.208,0.792,0.000,5,2
796,google launch major new social network called ...,0.0000,0.000,1.000,0.000,0,10
...,...,...,...,...,...,...,...
3739,tried initiate carpooling ridonkulous taxi lin...,-0.1027,0.055,0.945,0.000,0,3
1458,oh snap 80 party hosted google,0.5951,0.000,0.699,0.301,5,8
992,brown paper window line grows popup apple stor...,0.0000,0.000,1.000,0.000,0,13
218,hobo shotgun iphone game,0.0000,0.000,1.000,0.000,0,17


In [268]:
X_train_2['text'][2687]

'google map app save life regular basis map 150 million user 40 user mobile'

In [269]:
### Tokenize the training data with simple split of words, flattening to prepare for vectorization
X_train_2['text'] = X_train_2['text'].apply(lambda x: x.split())
X_train_2['text'] = X_train_2['text'].map(' '.join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_2['text'] = X_train_2['text'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_2['text'] = X_train_2['text'].map(' '.join)


In [270]:
X_train_2['text'][2687]

'google map app save life regular basis map 150 million user 40 user mobile'

In [271]:
second_model = imbpipeline([
    ('cvec', CountVectorizer(encoding = 'iso-8859-1', lowercase = False)),
    ('smote', SMOTE(sampling_strategy='minority', random_state=1337)),
    ('lr', LogisticRegression(random_state=1337, max_iter=1000))
])

In [272]:
second_model.fit(X_train_2, y_train_2)

ValueError: Found input variables with inconsistent numbers of samples: [7, 6354]

In [None]:

print('Training Accuracy Score:', baseline.score(X_train_2, y_train_2))
print('Validation Accuracy Score:', cross_val_score(baseline, X_train_1, y_train_1, cv=5).mean())

### Import confusion_matrix and print the confusion matrix for the validation data

from sklearn.metrics import confusion_matrix

y_pred_1 = baseline.predict(X_test_1)
cm = confusion_matrix(y_test_1, y_pred_1)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=baseline.classes_)
disp.plot()

### Instantiate the and vectorize the training data using TFIDVectorizer

In [210]:
cv = CountVectorizer()
X_train_2 = cv.fit_transform(X_train_2['text'])

In [211]:
X_train_2

<6354x7122 sparse matrix of type '<class 'numpy.int64'>'
	with 57972 stored elements in Compressed Sparse Row format>

In [212]:
lr = LogisticRegression(random_state=1337, max_iter=1000)
lr.fit(X_train_2, y_train_2)


LogisticRegression(max_iter=1000, random_state=1337)

In [213]:
print('Training Accuracy Score:', lr.score(X_train_2, y_train_2))

Training Accuracy Score: 0.8991186654076172


In [214]:
scores = cross_val_score(lr, X_train_2, y_train_2, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean()) 

Cross Validation Scores:  [0.65066876 0.67820614 0.6884343  0.6711251  0.69291339]
Mean Cross Validation Score:  0.6762695379049295


In [215]:
lr.coef_

array([[ 0.05702658, -0.02378017, -0.01927102, ...,  0.27820585,
        -0.06440298, -0.04509425],
       [ 0.28577321,  0.17652163,  0.29010016, ...,  0.17998232,
        -0.07144259,  0.15929976],
       [-0.34279979, -0.15274145, -0.27082915, ..., -0.45818818,
         0.13584557, -0.11420551]])

- Appears to be overfitting to the training data, but this is expected with logistic regression

In [None]:
### Creating an imbpalance-learn pipeline that uses SMOTE to oversample the minority classes and then uses a Logistic Regression to predict the emotion of a tweet

# baseline2 = imbpipeline([
#      ('cvec', CountVectorizer(encoding = 'iso-8859-1', lowercase = False)),
#      ('smote', SMOTE(sampling_strategy='minority', random_state=1337)),
#      ('lr', LogisticRegression(random_state=1337, max_iter=1000))
#  ])

### Fitting the pipeline to the training data and printing the training and validation accuracy scores

# baseline2.fit(X_train_2, y_train_2.values.ravel())
# print('Training Accuracy Score:', baseline2.score(X_train_2, y_train_2))
# print('Validation Accuracy Score:', cross_val_score(baseline2, X_train_2, y_train_2, cv=5).mean())

In [216]:
### Splitting the features into features and the target, including the 'text', sentiment columns, and emphasis columns as features

X3 = data[['text', 'compound', 'neg', 'neu', 'pos', 'punc_emphasis','capt_emphasis']]
y3 = data['emotion']

### Performing a train test split on the data

X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X3, y3, test_size=0.2, random_state=1337)

### TRAIN - Applying the clean_text function to the training data

X_train_3['text'] = X_train_3['text'].apply(clean_text)

### TRAIN - Tokenizing the training data with a simple split of words, and then flattening to prepare for vectorization

X_train_3['text'] = X_train_3['text'].apply(lambda x: x.split())
X_train_3['text'] = X_train_3['text'].map(' '.join)

### TRAIN - Vectorizing the training data using a TD-IDF Vectorizer

tfidf = TfidfVectorizer()
X_train_3 = tfidf.fit_transform(X_train_3['text'])

### TRAIN - Fit the training data to a Random Forest Classifier

rfc = RandomForestClassifier()
rfc.fit(X_train_3, y_train_3)

### VALIDATION - Perform a cross validation on the Random Forest Classifier

scores = cross_val_score(rfc, X_train_3, y_train_3, cv=5)
print('Cross Validation Scores: ', scores)
print('Mean Cross Validation Score: ', scores.mean()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_3['text'] = X_train_3['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_3['text'] = X_train_3['text'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_3['text'] = X_train_3['text'].map(' '.join)


Cross Validation Scores:  [0.66955153 0.66719119 0.66404406 0.65460268 0.65984252]
Mean Cross Validation Score:  0.6630463953610835


### Preprocessing Test Data on Model

In [217]:
### Decision Tree Test Set Preprocessing

X_test_1 = X_test_1.apply(clean_text)
X_test_1 = X_test_1.apply(lambda x: x.split())
X_test_1 = X_test_1.map(' '.join)
X_test_1 = cv.transform(X_test_1)

### Naive Bayes Test Set Preprocessing

X_test_2['text'] = X_test_2['text'].apply(clean_text)
X_test_2['text'] = X_test_2['text'].apply(lambda x: x.split())
X_test_2['text'] = X_test_2['text'].map(' '.join)
X_test_2 = cv.transform(X_test_2['text'])

### Random Forest Test Set Preprocessing

X_test_3['text'] = X_test_3['text'].apply(clean_text)
X_test_3['text'] = X_test_3['text'].apply(lambda x: x.split())
X_test_3['text'] = X_test_3['text'].map(' '.join)
X_test_3 = tfidf.transform(X_test_3['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_2['text'] = X_test_2['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_2['text'] = X_test_2['text'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_2['text'] = X_test_2['text'].map(' '.join)
A value is trying to be set on a copy 