In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, roc_curve, auc, accuracy_score
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Candidate locations of the tweets CSV, tried in order. The first one that
# exists is loaded, so the same cell works on Google Colab and on a local
# checkout without commenting lines in and out.
DATA_PATHS = [
    '/content/drive/MyDrive/Flatiron/project_4/tweets.csv',  # Google Colab / Drive
    'data/Tweets.csv',                                        # local copy
]

for data_path in DATA_PATHS:
    try:
        # Same encoding for every location, as in the original read call.
        df = pd.read_csv(data_path, encoding='latin1')
        break
    except FileNotFoundError:
        # Not available in this environment; fall through to the next candidate.
        continue
else:
    # No candidate could be opened: fail loudly instead of leaving `df` undefined.
    raise FileNotFoundError(f'Tweets CSV not found in any of: {DATA_PATHS}')

In [3]:
# Split the column names by dtype: numeric columns vs. object (text) columns.
numerical_features = df.select_dtypes(include="number").columns.tolist()
categorical_features = df.select_dtypes(include="object").columns.tolist()

print(f'Numerical Columns: \n{numerical_features}\n')
# Label fixed: this list holds the object-dtype columns, not the numeric ones.
print(f'Categorical Columns: \n{categorical_features}')

Numerical Columns: 
[]

Numerical Columns: 
['tweet_text', 'emotion_in_tweet_is_directed_at', 'is_there_an_emotion_directed_at_a_brand_or_product']


## EDA

In [11]:
# Preview the first five rows of the frame.
# NOTE(review): the output recorded below already contains the renamed and
# engineered columns (character_count, word_count, is_tweet_negative, is_apple),
# i.e. this cell was executed after the "Data Cleaning" and "Feature
# Engineering" cells further down. On a fresh top-to-bottom run it would show
# only the three raw columns; consider moving the EDA section below those cells.
df.head()

Unnamed: 0,tweet,tweet_is_directed_at,emotion,character_count,word_count,is_tweet_negative,is_apple
0,.@wesley83 i have a 3g iphone. after 3 hrs twe...,iphone,negative,127,23,1,1
1,@jessedee know about @fludapp ? awesome ipad/i...,ipad or iphone app,positive,139,22,0,1
2,@swonderlin can not wait for #ipad 2 also. the...,ipad,positive,79,15,0,1
3,@sxsw i hope this year's festival isn't as cra...,ipad or iphone app,negative,82,15,1,1
4,@sxtxstate great stuff on fri #sxsw: marissa m...,google,positive,131,17,0,0


In [12]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9075 entries, 0 to 9092
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tweet                 9075 non-null   object
 1   tweet_is_directed_at  3291 non-null   object
 2   emotion               9075 non-null   object
 3   character_count       9075 non-null   int64 
 4   word_count            9075 non-null   int64 
 5   is_tweet_negative     9075 non-null   int64 
 6   is_apple              8369 non-null   Int64 
dtypes: Int64(1), int64(3), object(3)
memory usage: 576.0+ KB


In [13]:
# Number of distinct values per column, lowest cardinality first.
df.nunique().sort_values(ascending=True)

is_tweet_negative          2
is_apple                   2
emotion                    4
tweet_is_directed_at       9
word_count                31
character_count          144
tweet                   9030
dtype: int64

In [14]:
# Count of missing values per column, most-affected columns first.
df.isna().sum().sort_values(ascending=False)

tweet_is_directed_at    5784
is_apple                 706
tweet                      0
emotion                    0
character_count            0
word_count                 0
is_tweet_negative          0
dtype: int64


## Data Cleaning

In [8]:
# Lowercase every string cell so the keyword matching below is case-insensitive.
df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)

# Renaming columns to better reflect the data
rename_cols = {
    'tweet_text': 'tweet',
    'emotion_in_tweet_is_directed_at': 'tweet_is_directed_at',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion',
}
df = df.rename(columns=rename_cols)


# Shorten the labels in the 'emotion' column.
# `replace` (rather than `map`) leaves values that are not keys of the dict
# untouched, so re-running this cell does not turn the already-shortened
# labels into NaN.
rename_values = {
    'no emotion toward brand or product': 'no emotion',
    'positive emotion': 'positive',
    'negative emotion': 'negative',
    "i can't tell": 'unsure',
}
df['emotion'] = df['emotion'].replace(rename_values)


# Drop the rows whose tweet text is missing (column already renamed to 'tweet').
# Reassigning instead of `inplace=True` keeps the cell free of hidden mutation;
# `subset` is passed as a list, the form accepted by every pandas version.
df = df.dropna(subset=['tweet'])


# Filter out tweets that have no labelled target and mention both Apple & Google,
# because they cannot be attributed to a single company.
# NOTE: despite its name, `apple_and_google` is the mask of rows to KEEP
# (the negation of "unlabelled and mentions both").
apple_and_google = ~(
    df['tweet_is_directed_at'].isna()
    & df['tweet'].str.contains('google')
    & df['tweet'].str.contains('apple')
)
# `.copy()` gives an independent frame, so columns added in later cells do not
# trigger SettingWithCopyWarning on a filtered slice.
df = df[apple_and_google].copy()



## Create assert statements to confirm the code achieved its goals:

assert df.loc[
    df['tweet_is_directed_at'].isna()
    & df['tweet'].str.contains('google')
    & df['tweet'].str.contains('apple'),
    'tweet',
].empty, 'unlabelled tweets mentioning both apple and google are still present'


## Feature Engineering

In [9]:
# Character length of each tweet (vectorised string accessor instead of apply(len)).
df['character_count'] = df['tweet'].str.len()
# Number of whitespace-separated tokens in each tweet.
df['word_count'] = df['tweet'].str.split().str.len()
# 1 when the tweet is labelled 'negative', 0 for every other label.
df['is_tweet_negative'] = (df['emotion'] == 'negative').astype('int64')



# What company is the tweet about? 1 = Apple, 0 = Google.
renamed_values = {
    'ipad': 1, 'ipad or iphone app': 1,
    'iphone': 1, 'other google product or service': 0,
    'android app': 0, 'android': 0, 'google': 0,
    'other apple product or service': 1, 'apple': 1,
}
# Nullable Int64 keeps the flag integer-typed while tweets without a labelled
# target stay missing (<NA>).
df['is_apple'] = df['tweet_is_directed_at'].map(renamed_values).astype('Int64')


# Assigning unknown tweet to company based on 'apple_words'
apple_words = ['apple', 'iphone', 'ipod', 'ipad', 'i-pad', 'app store', 'itunes']
# creates the string 'apple|iphone|ipod|ipad|i-pad|app store|itunes'
apple_regex = '|'.join(apple_words)

# `na=False` guarantees a pure boolean mask even if a tweet text were missing.
# The Apple keywords are applied first, so an unlabelled tweet that matches
# keywords of both companies is labelled Apple; the Google step below only
# fills rows that are still missing.
mentions_apple = df['tweet'].str.contains(apple_regex, case=False, na=False)
df.loc[df['is_apple'].isna() & mentions_apple, 'is_apple'] = 1


# Assigning unknown tweet to company based on 'google_words'
google_words = ['google', 'android', 'pixel', 'google play']
# creates the string 'google|android|pixel|google play'
google_regex = '|'.join(google_words)

mentions_google = df['tweet'].str.contains(google_regex, case=False, na=False)
df.loc[df['is_apple'].isna() & mentions_google, 'is_apple'] = 0

# Sanity check: the flag is binary wherever it is set.
assert set(df['is_apple'].dropna().unique()) <= {0, 1}, 'is_apple must be 0, 1 or missing'

In [10]:
# Class balance of the company flag (1 = Apple, 0 = Google).
# value_counts() excludes missing values by default, so tweets that could not
# be assigned to either company are not shown in this tally.
df['is_apple'].value_counts()

1    5567
0    2802
Name: is_apple, dtype: Int64