In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, roc_curve, auc, accuracy_score
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Location of the tweets CSV. The active value points at a mounted Google
# Drive folder (Colab); for a local run, switch to the commented-out path.
#TWEETS_CSV = 'data/Tweets.csv'
TWEETS_CSV = '/content/drive/MyDrive/Flatiron/project_4/tweets.csv'

# The file is decoded as latin1 while parsing.
df = pd.read_csv(TWEETS_CSV, encoding='latin1')

In [39]:
# Split the column names by dtype: numeric columns vs. object (string) columns.
numerical_features = list(df.select_dtypes("number"))
categorical_features = list(df.select_dtypes("object"))

print(f'Numerical Columns: \n{numerical_features}\n')
# Label corrected: this line lists the object-dtype columns, not the numeric ones.
print(f'Categorical Columns: \n{categorical_features}')

Numerical Columns: 
['character_count', 'word_count', 'is_tweet_negative', 'is_apple', 'number_of_mentions', 'hashtags_count', 'average_word_length']

Numerical Columns: 
['tweet', 'emotion']


## EDA

In [44]:
# Preview the first rows of the frame to sanity-check its columns and values.
df.head()

Unnamed: 0,tweet,emotion,character_count,word_count,is_tweet_negative,is_apple,number_of_mentions,hashtags_count,average_word_length
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,negative,127,23,1,1,1,2,4.521739
1,@jessedee know about @fludapp ? awesome ipad/i...,positive,139,22,0,1,2,1,5.363636
2,@swonderlin can not wait for #ipad 2 also. the...,positive,79,15,0,1,1,2,4.333333
3,@sxsw i hope this year's festival isn't as cra...,negative,82,15,1,1,1,1,4.533333
4,@sxtxstate great stuff on fri #sxsw: marissa m...,positive,131,17,0,0,1,1,6.764706


In [43]:
# Summarize the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8369 entries, 0 to 9092
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tweet                8369 non-null   object 
 1   emotion              8369 non-null   object 
 2   character_count      8369 non-null   int64  
 3   word_count           8369 non-null   int64  
 4   is_tweet_negative    8369 non-null   int64  
 5   is_apple             8369 non-null   Int64  
 6   number_of_mentions   8369 non-null   int64  
 7   hashtags_count       8369 non-null   int64  
 8   average_word_length  8369 non-null   float64
dtypes: Int64(1), float64(1), int64(5), object(2)
memory usage: 662.0+ KB


In [42]:
# Number of distinct values per column, lowest cardinality first.
df.nunique().sort_values(ascending=True)

is_tweet_negative         2
is_apple                  2
emotion                   4
number_of_mentions        9
hashtags_count           13
word_count               30
character_count         143
average_word_length     785
tweet                  8326
dtype: int64

In [41]:
# Number of missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

tweet                  0
emotion                0
character_count        0
word_count             0
is_tweet_negative      0
is_apple               0
number_of_mentions     0
hashtags_count         0
average_word_length    0
dtype: int64

In [40]:
# Pairwise correlation between the numeric columns.
# numeric_only=True states explicitly that the text columns ('tweet', 'emotion')
# are excluded: pandas 1.5 warns when this is left implicit, and pandas 2.x
# raises on the non-numeric columns instead of silently dropping them.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,character_count,word_count,is_tweet_negative,is_apple,number_of_mentions,hashtags_count,average_word_length
character_count,1.0,0.895978,0.043287,-0.039956,0.280058,0.077787,0.083888
word_count,0.895978,1.0,0.049625,0.101657,0.205961,-0.03352,-0.345719
is_tweet_negative,0.043287,0.049625,1.0,0.039736,-0.059424,-0.033026,-0.015826
is_apple,-0.039956,0.101657,0.039736,1.0,-0.036238,0.037333,-0.295527
number_of_mentions,0.280058,0.205961,-0.059424,-0.036238,1.0,-0.020735,0.100823
hashtags_count,0.077787,-0.03352,-0.033026,0.037333,-0.020735,1.0,0.237964
average_word_length,0.083888,-0.345719,-0.015826,-0.295527,0.100823,0.237964,1.0



## Data Cleaning

In [4]:
def _lower_if_str(value):
    """Return ``value`` lower-cased when it is a string; otherwise return it unchanged."""
    return value.lower() if isinstance(value, str) else value


# Lowercase every string cell so the value mapping and keyword matching below
# only have to deal with one casing. Column-wise Series.map is the element-wise
# equivalent of DataFrame.applymap, which newer pandas releases deprecate.
df = df.apply(lambda column: column.map(_lower_if_str))

# Renaming columns to better reflect the data
rename_cols = {
      'tweet_text': 'tweet',
      'emotion_in_tweet_is_directed_at': 'tweet_is_directed_at',
      'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'
              }
df = df.rename(columns=rename_cols)


# Rename values in the 'emotion' column (keys are lower-case because the
# frame was lower-cased above). Series.map turns any label that is not a key
# of this dict into NaN.
rename_values = {
      'no emotion toward brand or product': 'no emotion',
      'positive emotion': 'positive',
      'negative emotion': 'negative',
      "i can't tell": 'unsure'
              }
df['emotion'] = df['emotion'].map(rename_values)


# Drop rows with no tweet text (the column is called 'tweet' after the rename
# above; the original author notes this removes a single row).
df = df.dropna(subset=['tweet'])


# Filter out tweets that have no labelled target and mention both Apple and
# Google (17 tweets per the original author's note).
# NOTE: despite its name, `apple_and_google` is the KEEP mask -- it is False
# for exactly those ambiguous tweets and True for every row that is retained.
apple_and_google = ~((df['tweet_is_directed_at'].isna()) &
                   (df['tweet'].str.contains('google')) &
                   (df['tweet'].str.contains('apple')))
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
df = df[apple_and_google].copy()



## Create assert statements to confirm the code achieved its goals:

# Sanity check for the filter above: no tweet without a labelled target may
# still mention both 'google' and 'apple'.
no_labelled_target = df['tweet_is_directed_at'].isna()
mentions_google = df['tweet'].str.contains('google')
mentions_apple = df['tweet'].str.contains('apple')
assert df.loc[no_labelled_target & mentions_google & mentions_apple, 'tweet'].empty


## Feature Engineering

In [5]:
def _average_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns 0.0 when ``text`` contains no words, instead of letting
    ``np.mean([])`` emit a RuntimeWarning and produce NaN.
    """
    words = text.split()
    if not words:
        return 0.0
    return np.mean([len(word) for word in words])


# Create a column with the character length of the tweet
df['character_count'] = df['tweet'].str.len()


# Create a column with the word count of the tweet (whitespace-separated tokens)
df['word_count'] = df['tweet'].str.split().str.len()


# Create a column flagging whether the tweet was negative toward a brand (1) or not (0).
# Rows whose emotion is missing or any other label get 0.
df['is_tweet_negative'] = df['emotion'].eq('negative').astype('int64')


# What company is the tweet about? 1 = Apple product/service, 0 = Google product/service.
renamed_values = {
      'ipad': 1, 'ipad or iphone app': 1,
      'iphone': 1, 'other google product or service': 0,
      'android app': 0, 'android': 0, 'google': 0,
      'other apple product or service': 1, 'apple': 1,
                 }
# Targets missing from the dict become missing values; the nullable 'Int64'
# dtype keeps the 0/1 flags as integers alongside them.
df['is_apple'] = df['tweet_is_directed_at'].map(renamed_values).astype('Int64')


# Create a feature called 'number_of_mentions' to count @mentions.
# Raw string so that \w reaches the regex engine without an invalid-escape warning.
df['number_of_mentions'] = df['tweet'].str.count(r'@\w+')


# Create a feature 'hashtags_count' to count '#' characters
df['hashtags_count'] = df['tweet'].str.count('#')


# Create a feature named 'average_word_length'
df['average_word_length'] = df['tweet'].apply(_average_word_length)


## Data Filtering

In [8]:
# Keywords used to infer the company for tweets that still have no 'is_apple' value.
apple_words = ['apple', 'iphone','ipod','ipad', 'i-pad', 'app store', 'itunes']
google_words = ['google', 'android','pixel', 'google play']

# Alternation patterns built from the keyword lists:
#   'apple|iphone|ipod|ipad|i-pad|app store|itunes'
#   'google|android|pixel|google play'
apple_regex = '|'.join(apple_words)
google_regex = '|'.join(google_words)

# Fill in the missing labels, Apple (1) first and Google (0) second. The
# missing-value mask is recomputed on every pass, so a tweet that matched the
# Apple keywords is never relabelled by the Google pass.
for brand_regex, brand_flag in ((apple_regex, 1), (google_regex, 0)):
    still_unlabelled = df['is_apple'].isna()
    mentions_brand = df['tweet'].str.contains(brand_regex, case=False)
    df.loc[still_unlabelled & mentions_brand, 'is_apple'] = brand_flag


# Remove the tweets that matched neither keyword list
df.dropna(subset=['is_apple'], inplace=True)


# Dropping 'tweet_is_directed_at' (many missing values) is left disabled here.
# NOTE(review): the df.info() output in this notebook does not list that
# column, so the drop appears to have been executed at some point -- confirm.
#df = df.drop(columns=['tweet_is_directed_at'])

In [45]:
df['is_apple'].value_counts()

1    5567
0    2802
Name: is_apple, dtype: Int64