## Imports and Setup

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
import string

In [None]:
from collections import Counter
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

In [None]:
from src.paths import INTERIM_CORPUS

In [None]:
# Ask the inline backend for "retina" (high-DPI) figure output
%config InlineBackend.figure_format = "retina"
# Render matplotlib figures inline, below the cell that creates them
%matplotlib inline

## Ingestion and Initial Inspection of the Data

### Number of files to collect

In [None]:
# Tally the globbed corpus files by file extension
csv_suffixes = [path.suffix for path in INTERIM_CORPUS.glob('*.csv')]
Counter(csv_suffixes)

### Import

In [None]:
# Load every CSV in the interim corpus into one DataFrame.
# The paths are sorted because glob order depends on the filesystem; sorting
# keeps the row order (and therefore head()/tail()) stable between runs.
corpus_files = sorted(INTERIM_CORPUS.glob('*.csv'))
if not corpus_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate"
    raise FileNotFoundError('No CSV files found in {}'.format(INTERIM_CORPUS))

df = pd.concat(
    [
        pd.read_csv(
            f,
            encoding='utf-8',
            parse_dates=['parsed_created_at'])
        for f in corpus_files
    ],
    # Drop the per-file row numbers; without this the combined index would
    # repeat 0..n for every file instead of being one continuous range.
    ignore_index=True,
)

### Data Inspection

In [None]:
# Peek at the top of the combined frame (first 5 rows)
df.head(5)

In [None]:
# Peek at the bottom of the combined frame (last 5 rows)
df.tail(5)

**Variables/Column names**

In [None]:
# Column labels of the combined frame
df.columns

**Variable/Column types**

In [None]:
# dtype of each column in the combined frame
df.dtypes

## Basic Stats

In [None]:
# Total number of rows in the combined frame
df.shape[0]

### Tweet Types

In [None]:
# Tally rows per `tweet_type` value and echo the counts
types = df['tweet_type'].value_counts()
print(types)

In [None]:
# One bar per tweet type; bar height is the number of rows of that type
_ = sns.barplot(
    x=types.index,
    y=types.values,
    alpha=0.8,
)

### Tweet Types by Day

In [None]:
# Index the frame by tweet timestamp so the calendar day can be derived from
# the index. Guarded and non-inplace: once the column has become the index,
# re-running this cell is a no-op instead of raising KeyError.
if 'parsed_created_at' in df.columns:
    df = df.set_index('parsed_created_at')

In [None]:
# Calendar date of each row, taken from the timestamp index
tweet_dates = df.index.date
df['day'] = tweet_dates

In [None]:
# Rows counted per (day, tweet_type) pair, then pivoted so that each tweet
# type becomes its own column indexed by day
daily_counts = df.groupby(['day', 'tweet_type']).size()
daily_plot = daily_counts.unstack(level='tweet_type')

In [None]:
# Line chart with one line per tweet type, on a logarithmic y axis
_ = daily_plot.plot.line(logy=True, title='Daily Tweets by Type')

### Users

In [None]:
# How many distinct screen names appear in the corpus
df.user_screen_name.nunique()

In [None]:
# Per-user count of non-null values in every column; the reset turns
# `user_screen_name` back into an ordinary column.
per_user_counts = df.groupby(['user_screen_name']).count()
tweeps = per_user_counts.reset_index()

In [None]:
# Sort ascending on the `text` count and keep the last ten rows,
# i.e. the users with the highest counts, largest last
tweeps_by_count = tweeps.sort_values(by='text')
tweeps = tweeps_by_count.iloc[-10:]

In [None]:
# Show the current contents of `tweeps`
tweeps

In [None]:
# Bar chart of the users held in `tweeps`, labelled by screen name.
x = tweeps['user_screen_name']
y = tweeps['text']  # per-user count of non-null `text` values
plt.xlabel('Twitter handle')
plt.ylabel('Number of Tweets')
plt.title('Most number of Tweets by user')
# Size the bars from the data instead of a hard-coded 10, so the cell still
# works when the corpus has fewer than ten distinct users.
h = plt.bar(range(len(y)), y, label='Most Tweets by user')
# Put each tick 65% of the way across its bar so the rotated, right-aligned
# label ends underneath the bar it belongs to.
xticks_pos = [0.65*patch.get_width() + patch.get_xy()[0] for patch in h]
_ = plt.xticks(xticks_pos, x, ha='right', rotation=45)

### Hashtags

We'll get the hashtags from the text to make sure we get all of them, as the `hashtag` field isn't always reliable. 

In [None]:
# Pull every "#word" token out of the tweet text (one row per match),
# lower-case them so "#Tag" and "#tag" are counted together, then tally.
hashtag_matches = df['text'].str.extractall(r'(\#\w+)')[0]
hashtags = hashtag_matches.str.lower().value_counts()


In [None]:
# `hashtags` is a value_counts() Series indexed by hashtag, so its length is
# the number of distinct hashtags. (`.nunique()` on it would instead count
# how many distinct frequency values occur.)
len(hashtags)

In [None]:
# Keep positions 1-10 of the frequency-sorted Series, i.e. skip the single
# most frequent hashtag at position 0. `.iloc` makes the positional slice
# explicit instead of relying on pandas unpacking the one-element tuple
# produced by `[1:11,]`.
# NOTE: this overwrites `hashtags`, so re-running only this cell trims it again.
hashtags = hashtags.iloc[1:11]

In [None]:
# Show the current contents of `hashtags`
hashtags

In [None]:
# Horizontal bars: hashtag on the y axis, its count on the x axis
_ = sns.barplot(
    y=hashtags.index,
    x=hashtags.values,
    alpha=0.8,
)

### Mentions

In [None]:
# Pull every "@handle" token out of the tweet text (one row per match),
# lower-case them so differently-cased handles are merged, then tally.
mention_matches = df['text'].str.extractall(r'(@[\w_]+)')[0]
mentions = mention_matches.str.lower().value_counts()

In [None]:
# Keep the first ten entries of the frequency-sorted Series. `.iloc` makes the
# positional slice explicit instead of relying on pandas unpacking the
# one-element tuple produced by `[:10,]`.
mentions = mentions.iloc[:10]

In [None]:
# Horizontal bars: mentioned handle on the y axis, its count on the x axis
_ = sns.barplot(
    y=mentions.index,
    x=mentions.values,
    alpha=0.8,
)

### Wordcloud

I have my doubts about how informative word clouds actually are, but maybe they work as art.

In [None]:
# Tokens to blank out before vectorising: stand-alone "al"/"RT", newlines,
# HTML entities and URLs. Changes from the earlier pattern:
#   * raw string, so "\(" / "\)" are no longer invalid string escapes;
#   * word boundaries, so "al"/"RT" are removed only as whole tokens and not
#     from inside words such as "also" or "digital";
#   * "https?" so plain http:// links are matched too (the old "http[s]"
#     required the "s"); a bare "http"/"https" token is still removed;
#   * "&#?\w+;" matches an actual entity (&amp;, &#39;) instead of everything
#     between any "&" and the next ";".
junk = re.compile(
    r"\b(?:al|RT)\b"
    r"|\n"
    r"|&#?\w+;"
    r"|\bhttps?(?:://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|\b)"
)
# Skip missing text: junk.sub() raises TypeError on NaN
tweets = [junk.sub(" ", t) for t in df.text.dropna()]

vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=.5)
tfv = vec.fit_transform(tweets)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and fall back only on old releases.
if hasattr(vec, 'get_feature_names_out'):
    terms = list(vec.get_feature_names_out())
else:
    terms = list(vec.get_feature_names())

# Size each term by its total TF-IDF weight across all tweets. Joining the
# bare vocabulary into a string (as before) ignored `tfv` entirely, because
# every term then appeared exactly once.
# `.flat` walks the column sums whether sum() returns a 1xN matrix or a 1-D array.
term_weights = dict(zip(terms, tfv.sum(axis=0).flat))
wc = WordCloud(height=1000, width=1000, max_words=1000).generate_from_frequencies(term_weights)

plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off")
plt.show()

That word cloud included some non-English tweets, so let's keep only the tweets whose `lang` is `en` and draw it again.

In [None]:
# Keep only the rows whose `lang` value is 'en'
is_english = df['lang'] == 'en'
english_df = df.loc[is_english]

In [None]:
# Tokens to blank out before vectorising: stand-alone "al"/"RT", newlines,
# HTML entities and URLs. Changes from the earlier pattern:
#   * raw string, so "\(" / "\)" are no longer invalid string escapes;
#   * word boundaries, so "al"/"RT" are removed only as whole tokens and not
#     from inside words such as "also" or "digital";
#   * "https?" so plain http:// links are matched too (the old "http[s]"
#     required the "s"); a bare "http"/"https" token is still removed;
#   * "&#?\w+;" matches an actual entity (&amp;, &#39;) instead of everything
#     between any "&" and the next ";".
junk = re.compile(
    r"\b(?:al|RT)\b"
    r"|\n"
    r"|&#?\w+;"
    r"|\bhttps?(?:://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|\b)"
)
# English-only corpus; skip missing text because junk.sub() raises TypeError on NaN
tweets = [junk.sub(" ", t) for t in english_df.text.dropna()]

vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=.5)
tfv = vec.fit_transform(tweets)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and fall back only on old releases.
if hasattr(vec, 'get_feature_names_out'):
    terms = list(vec.get_feature_names_out())
else:
    terms = list(vec.get_feature_names())

# Size each term by its total TF-IDF weight across all tweets. Joining the
# bare vocabulary into a string (as before) ignored `tfv` entirely, because
# every term then appeared exactly once.
# `.flat` walks the column sums whether sum() returns a 1xN matrix or a 1-D array.
term_weights = dict(zip(terms, tfv.sum(axis=0).flat))
wc = WordCloud(height=1000, width=1000, max_words=1000).generate_from_frequencies(term_weights)

plt.figure(figsize=(10, 10))
plt.imshow(wc)
plt.axis("off")
plt.show()