# Data analysis

Sentiment:
* If sentiment is zero, then it means neutral sentiment.
* If sentiment is less than zero, then it means a negative sentiment.
* And if sentiment is more than zero, then it means a positive sentiment.

In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt 
plt.style.use('ggplot')
import pprint

# Open a client with pymongo's default connection settings, then select the
# `twitter_data` collection of the `twitter_data` database.
client = MongoClient()
db = client['twitter_data']
col = db['twitter_data']

# Pull every document stored in the collection
result = col.find()

# Materialise the cursor into a DataFrame and normalise the column types:
# sentiment becomes float, created_at becomes a pandas datetime.
df = pd.DataFrame(list(result))
df['sentiment'] = df['sentiment'].astype(float)
df['created_at'] = pd.to_datetime(df['created_at'])

## Outliers removal

In [None]:
# Country codes analysed throughout the notebook and the colour used to draw each one.
countries = ('AR', 'BO', 'CR', 'EC', 'ES', 'HN', 'PY', 'VE')
colors = {
    'AR': 'blue',
    'BO': '#afb20c',
    'CR': 'orange',
    'EC': 'magenta',
    'ES': 'red',
    'HN': 'cyan',
    'PY': 'green',
    'VE': 'black',
}

# Country by country, drop every tweet whose sentiment lies more than four
# standard deviations away from that country's mean sentiment.
for country in countries:
    country_rows = df.loc[df['country_code'] == country]
    center = country_rows['sentiment'].mean()
    spread = 4 * country_rows['sentiment'].std()
    too_low = country_rows['sentiment'] < center - spread
    too_high = country_rows['sentiment'] > center + spread
    # Outliers are identified by tweet id and removed from the full frame.
    outlier_ids = country_rows.loc[too_low | too_high, 'id']
    df = df.loc[~df['id'].isin(outlier_ids)]

## Descriptive statistics 

We plot the mean sentiment per country over the whole period, and then show the remaining descriptive statistics for each country.

In [None]:
# Bar chart of the mean sentiment of each country over all the data.
mean_by_country = df.groupby('country_code')['sentiment'].mean()
mean_by_country.plot.bar(figsize=(8, 8))

In [None]:
# Summary statistics of the sentiment column, one row per country.
df.groupby('country_code').sentiment.describe()

Now we show the estimated sentiment density for each country.

In [None]:
# Country codes to plot and the colour assigned to each of them.
countries = ('AR', 'BO', 'CR', 'EC', 'ES', 'HN', 'PY', 'VE')
colors = {'AR': 'blue', 'BO': '#afb20c', 'CR': 'orange',
          'EC': 'magenta', 'ES': 'red', 'HN': 'cyan',
          'PY': 'green', 'VE': 'black'}

import seaborn as sns
sns.set(color_codes=True)

# Draw the kernel-density estimate of the sentiment of each country,
# one figure per country.
for country in countries:
    df_one_contry = df[df.country_code == country]
    result = df_one_contry.sentiment
    # `sns.distplot` is deprecated; `sns.kdeplot` draws the density-only curve
    # that `distplot(..., hist=False)` used to produce.
    sns.kdeplot(result, color=colors.get(country), label=country)
    # Recent seaborn versions do not add the legend automatically.
    plt.legend()
    # `sns.plt` is no longer exposed by seaborn (it raised AttributeError);
    # call matplotlib's pyplot module directly.
    plt.show()

## Level of confidence of the means

Here we compute the confidence interval of the mean sentiment for every country.

In [None]:
import statsmodels.stats.api as sms


# For every country, print its code, its mean sentiment and the confidence
# interval of that mean as returned by statsmodels' t-based estimate.
for country in countries:
    result = df.loc[df.country_code == country, 'sentiment']
    interval = sms.DescrStatsW(result).tconfint_mean()
    print(country, ': ', result.mean(), interval)

## General visualizations

The next plot shows the daily mean sentiment across all countries, followed by its daily standard deviation.

In [None]:
# Put the month, the day of month and the sentiment of every tweet side by side.
created = df['created_at'].dt
df_mean_all = pd.concat(
    [created.month, created.day, df['sentiment']],
    axis=1,
    keys=['month', 'day', 'sentiment'],
)

# Mean sentiment of each (month, day) pair, all countries pooled together.
daily_mean = df_mean_all.groupby(['month', 'day'])['sentiment'].mean()
daily_mean.plot(figsize=(8, 8))

In [None]:
# Standard deviation of the sentiment of each (month, day) pair, all countries pooled.
daily_std = df_mean_all.groupby(['month', 'day'])['sentiment'].std()
daily_std.plot(figsize=(8, 8))

The next plot shows the mean per day and per country of all data.

In [None]:
# One figure per country: mean sentiment of each (month, day) pair.
for country in countries:
    country_rows = df[df['country_code'] == country]
    month = country_rows['created_at'].dt.month
    day = country_rows['created_at'].dt.day
    result = country_rows.groupby([month, day])['sentiment'].mean()
    ax = result.plot(label=country, legend=True, figsize=(10,10), color=colors.get(country))
    ax.set_xlabel('month, day')
    # Flush the figure so every country is drawn on its own axes.
    plt.show()

The next plot shows the mean per day and per country, considering only tweets with negative or positive sentiment (neutral tweets are excluded).

In [None]:
# Mean sentiment per day for each country, ignoring tweets whose sentiment is exactly 0.
# NOTE(review): the grouping key is the day of month only, so the same day
# number in different months is pooled together -- confirm this is intended.
for country in countries:
    is_country = df['country_code'] == country
    is_not_neutral = df['sentiment'] != 0
    country_rows = df[is_country & is_not_neutral]
    day = country_rows['created_at'].dt.day
    result = country_rows.groupby([day])['sentiment'].mean()
    ax = result.plot(label=country, legend=True, figsize=(10,10), color=colors.get(country))
    ax.set_xlabel('day')

The next plot shows the mean per day and per country, with sentiment represented as 3 values (negative=-1, neutral=0, positive=+1).

In [None]:
# Mean per day and per country of the sentiment reduced to three values:
# -1 (negative), 0 (neutral) and +1 (positive).
# NOTE(review): the grouping key is the day of month only, so the same day
# number in different months is pooled together -- confirm this is intended.
for country in countries:
    df_one_contry = df[df.country_code == country]
    # np.sign maps positive values to 1 and negative values to -1, leaving 0
    # unchanged. Computing it on the slice's own column avoids assigning into
    # a filtered copy of `df` (SettingWithCopyWarning) and avoids indexing the
    # slice with boolean masks built from the full frame.
    polarity = np.sign(df_one_contry.sentiment)
    result = polarity.groupby(df_one_contry.created_at.dt.day).mean()
    ax = result.plot(label=country, legend=True, figsize=(10,10), color=colors.get(country))
    ax.set_xlabel('day')

The same plot, excluding neutral tweets.

In [None]:
# Mean per day and per country of the sentiment reduced to -1 (negative) and
# +1 (positive); tweets whose sentiment is exactly 0 are left out.
# NOTE(review): the grouping key is the day of month only, so the same day
# number in different months is pooled together -- confirm this is intended.
for country in countries:
    df_one_contry = df[(df.country_code == country) & (df.sentiment != 0)]
    # np.sign maps positive values to 1 and negative values to -1. Computing
    # it on the slice's own column avoids assigning into a filtered copy of
    # `df` (SettingWithCopyWarning) and avoids indexing the slice with boolean
    # masks built from the full frame.
    polarity = np.sign(df_one_contry.sentiment)
    result = polarity.groupby(df_one_contry.created_at.dt.day).mean()
    ax = result.plot(label=country, legend=True, figsize=(10,10), color=colors.get(country))
    ax.set_xlabel('day')

Now we visualize the sentiment distribution with histograms, one per country.

In [None]:
# Histogram of the sentiment of each country, drawn in that country's colour.
for country in countries:
    in_country = df['country_code'] == country
    country_rows = df[in_country]
    country_rows.hist(column='sentiment', by='country_code', color=colors.get(country))