In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [None]:
# Show the current working directory; the dataset path used below is relative to it.
os.getcwd()

In [None]:
# Load the tweets; header=None means the file has no header row,
# so the column names are supplied explicitly.
train = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    header=None,
    names=['sentiment', 'tweet_id', 'date', 'flag', 'user', 'text'],
    encoding="ISO-8859-1",
    engine='python',
)

### Exploratory Data Analysis

Find the shape of the dataset:

In [None]:
# (number of rows, number of columns) of the loaded frame
train.shape

**Looking at my target variable `sentiment`:**

In [None]:
# How many rows carry each raw sentiment label
train['sentiment'].value_counts()

It appears that negative tweets are given a label of 0 and positive tweets are given a label of 4. For simplicity, I will convert these binary labels to -1 (negative) and 1 (positive).

In [None]:
def convert_sent(x):
    """Map a sentiment label to 1 (positive) or -1 (negative).

    Parameters
    ----------
    x : int
        A raw label (4 is positive; any other value, e.g. 0, is negative)
        or an already-converted label (1 or -1).

    Returns
    -------
    int
        1 for a positive label, -1 otherwise.
    """
    # Accept 1 as well as 4 so the function is idempotent: applying it to
    # labels that were already converted must not flip the positives to -1.
    return 1 if x in (1, 4) else -1

In [None]:
# Replace each raw label with the value convert_sent returns for it, then show the column
train['sentiment'] = train['sentiment'].map(convert_sent)
train['sentiment']

In [None]:
# Label distribution after the conversion with convert_sent
train['sentiment'].value_counts()

**Looking at the tweet_id:** 

In principle, the tweet id should be a unique identifier for each tweet, so if we count the number of unique tweet ids, we should get back a value of 1,600,000.

In [None]:
# Number of distinct tweet ids (a missing id, if any, is counted as one value)
train['tweet_id'].nunique(dropna=False)

But apparently that is not the case... Perhaps there are duplicated rows in the dataset:

In [None]:
# Count how many rows share each tweet id. value_counts sorts in descending
# order, so ids that occur more than once come first.
id_count = train['tweet_id'].value_counts()
id_count.head()

In [None]:
# Widen the displayed column width so the tweet text is readable,
# then show every row recorded under the duplicated id 1753678185
pd.set_option('display.max_colwidth', 90)
train.loc[train['tweet_id'] == 1753678185]

In [None]:
# Show every row recorded under tweet id 1984377787
train.loc[train['tweet_id'] == 1984377787]

It appears they are indeed duplicate rows, but with different sentiment labels! I have decided to remove these rows from the dataset.

In [None]:
# Keep only the tweets whose id has exactly one entry; every copy of a
# duplicated id is removed.
unique_ids = id_count[id_count == 1].index
# drop=True discards the old row labels instead of inserting them as an extra
# 'index' column, which would also make a second run of this cell fail.
train = train[train['tweet_id'].isin(unique_ids)].reset_index(drop=True)
train.head()

In [None]:
# Shape after removing the rows with duplicated tweet ids
train.shape

**Looking at the `date` column:**

In [None]:
# Data type pandas assigned to the date column
train['date'].dtype

In [None]:
# Time zone: characters 20-23 of each date string, tallied by distinct value
area_code = train['date'].str.slice(20, 24)
area_code.value_counts()

It appears all of the time zones are recorded in Pacific Time.

In [None]:
# Year: characters 24-28 of each date string, tallied by distinct value
year = train['date'].str.slice(24, 29)
year.value_counts()

All of the tweets are from 2009.

**Looking at the `flag` column:**

In [None]:
# Distinct values in the flag column and how often each occurs
train['flag'].value_counts()

It appears all of the rows contain the value `NO_QUERY`, so this column would not bring any information to our model and I will be removing it.

In [None]:
# Remove the flag column from the frame
train = train.drop(columns=['flag'])

In [None]:
# Shape after dropping the flag column
train.shape

**Looking at the `user` column:**

In [None]:
# Number of tweets per user, most frequent first
train['user'].value_counts()

We have 659,775 unique users in this dataset from 2009.

**Looking at the `text` column:**

In [None]:
# The five most frequently repeated tweet texts
train['text'].value_counts().head()

In [None]:
# Count tweets whose text does / does not contain a '#' (matched as a literal character)
has_hashtag = train['text'].str.contains('#', regex=False)
has_hashtag.value_counts()

In [None]:
# Count tweets whose text does / does not contain an '@' (matched as a literal character)
has_mention = train['text'].str.contains('@', regex=False)
has_mention.value_counts()

In [None]:
# Count tweets whose text contains a url. The pattern is a regular expression;
# 's?' makes it match both http:// and https:// links.
train['text'].str.contains(r'https?://').value_counts()