In [None]:
import pandas as pd
import numpy as np

from collections import Counter

from sklearn.feature_extraction import stop_words
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler, Normalizer
from yellowbrick.features import Rank2D


In [None]:
# Load the raw stories export into a DataFrame.
# NOTE(review): this is an absolute path inside one user's home directory;
# point it at your own copy of the file before running.
stories = pd.read_json('/home/katharine/Downloads/datasets/stories-20170501.json')

### Data Quality Check

In [None]:
stories.head()

In [None]:
stories.shape

In [None]:
stories.dtypes

In [None]:
stories.dropna().shape

In [None]:
# Report how many missing values each column contains, one column per line.
missing_per_column = stories.isnull().sum()
for column_name, missing in missing_per_column.items():
    print('{}\tnans:{}'.format(column_name, missing))

### Adding Features

In [None]:
# Use each story's short_id as the row index.
# NOTE(review): set_index moves the column into the index, so running this
# cell a second time fails because 'short_id' is no longer a column.
stories = stories.set_index('short_id')

In [None]:
# Expand the nested submitter_user records into their own columns, renaming
# the user's created_at so it does not collide with the story's created_at.
user_df = (
    stories['submitter_user']
    .apply(pd.Series)
    .rename(columns={'created_at': 'user_created_at'})
)
# Swap the nested column for the expanded user columns.
stories_without_user = stories.drop(columns=['submitter_user'])
stories = pd.concat([stories_without_user, user_df], axis=1)

In [None]:
# One-hot encode the tags: spread each story's tags across columns, stack
# them into one long Series (missing slots are dropped by stack), build one
# dummy column per distinct tag, then add the dummies back up per story.
# Assumes every `tags` entry is a list of tag names — TODO confirm.
tag_df = stories.tags.apply(pd.Series)
# groupby(level=0).sum() is the supported spelling of the old sum(level=0),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
tag_df = pd.get_dummies(tag_df.stack()).groupby(level=0).sum()
stories = pd.concat([stories, tag_df], axis=1)

In [None]:
# Derive calendar features (hour of day, day of week, year) from the
# story's creation timestamp.
created = stories['created_at']
stories = stories.assign(
    created_hour=created.map(lambda ts: ts.hour),
    created_dow=created.map(lambda ts: ts.weekday()),
    created_year=created.map(lambda ts: ts.year),
)

In [None]:
stories.head()

In [None]:
stories.columns.values

### Correlation investigation

In [None]:
stories[['karma', 'score']].corr()

In [None]:
stories[['created_year', 'score']].corr()

In [None]:
%matplotlib inline
# Mean score per hour of day. Select 'score' before aggregating: averaging
# the whole frame also tries to average the text/list columns, which raises
# TypeError on pandas >= 2.0 (and was wasted work before that).
stories.groupby('created_hour')['score'].mean().plot()

In [None]:
%matplotlib inline
# Bar chart of how many stories carry each of a handful of tags.
tag_columns = ['crypto', 'hardware', 'openbsd', 'security']
tag_totals = stories[tag_columns].sum()
tag_totals.plot(kind='bar')

### Your Turn

- Investigate a few more features by plotting with groupby or aggregations. You can also use Yellowbrick's `Rank2D` visualizer with the numeric columns.
- Did you find anything interesting? Share your findings in our discussion and Slack chat this week!

## Creating more features (str to numeric)

In [None]:
# List the columns stored with object dtype, one name per line.
object_columns = stories.select_dtypes(['object']).columns.values
print('\n'.join(object_columns))

In [None]:
stop_words.ENGLISH_STOP_WORDS

In [None]:
stories.title.values.ravel()

In [None]:
import re

def clean_text(sentence):
    """Lower-case a sentence and split it into non-stop-word tokens.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        Every run of word characters in the lower-cased sentence, in order,
        excluding tokens found in ``stop_words.ENGLISH_STOP_WORDS``.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    words = re.findall(r"\w+", sentence.lower())
    return [word for word in words
            if word not in stop_words.ENGLISH_STOP_WORDS]

In [None]:
# %load ../solutions/new_clean_text.py


In [None]:
clean_text('I like watching TV News.')

In [None]:
# Tokenize every story title; the result holds one token list per title.
title_values = stories.title.values.ravel()
all_words = list(map(clean_text, title_values))

In [None]:
# Counter hashes each element, and every element of all_words is a list (one
# token list per title), so this call raises TypeError. Catch and show the
# error so the notebook still runs top to bottom; the working version
# flattens the lists with itertools.chain a few cells below.
try:
    cntr = Counter(all_words)
except TypeError as err:
    print('Counter(all_words) failed: {}'.format(err))

In [None]:
all_words[:5]

In [None]:
import itertools
itertools.chain?

In [None]:
# Flatten the token lists of the first four titles into a single list.
list(itertools.chain.from_iterable(all_words[:4]))

In [None]:
# Count how often each token occurs across all titles.
flattened_words = itertools.chain.from_iterable(all_words)
cntr = Counter(flattened_words)

In [None]:
cntr.most_common(25)

In [None]:
pd.Series(list(cntr.values())).hist(bins=100)

In [None]:
cntr['c']

In [None]:
def get_popularity(title):
    """Score a title by the corpus frequency of its words.

    The title is tokenized with ``clean_text`` and the ``cntr`` counts of the
    resulting tokens are added together; repeated tokens count each time.
    Returns 0 when no tokens remain.
    """
    total = 0
    for token in clean_text(title):
        total += cntr[token]
    return total

In [None]:
# Attach the popularity score of each title as a new numeric feature.
title_scores = stories['title'].apply(get_popularity)
stories['title_popularity'] = title_scores

In [None]:
stories['title_popularity'].hist(bins=100)

### Normalizing or Standardizing a feature

In [None]:
MinMaxScaler?

In [None]:
# Fit a MinMaxScaler on the single title_popularity column and display the
# transformed values. The double brackets pass a one-column DataFrame (2-D
# input) rather than a Series.
scaler = MinMaxScaler()
scaler.fit_transform(stories[['title_popularity']])


In [None]:
# Re-fit the scaler on title_popularity and store the scaled values as a new
# column alongside the original.
stories['scaled_title_popularity'] = scaler.fit_transform(
    stories[['title_popularity']])

In [None]:
stories['scaled_title_popularity'].hist(bins=100)

In [None]:
# Keep only the scaled version of the feature.
# NOTE(review): not idempotent — running this cell again fails once the
# 'title_popularity' column is gone.
stories = stories.drop(['title_popularity'], axis=1)

## Scaling Target Variable

In [None]:
stories['score'].hist(bins=100)

In [None]:
# First and third quartiles of the score, and the interquartile range.
score_quartiles = stories['score'].quantile([0.25, 0.75])
first_qr, third_qr = score_quartiles.tolist()
iqr = third_qr - first_qr

In [None]:
# Stories whose score lies more than 1.5 * IQR outside the quartiles.
lower_fence = first_qr - 1.5 * iqr
upper_fence = third_qr + 1.5 * iqr
below_lower = stories['score'] < lower_fence
above_upper = stories['score'] > upper_fence
outliers = stories[below_lower | above_upper]

In [None]:
outliers.shape

In [None]:
outliers['score'].hist(bins=50)

In [None]:
# NOTE: I am making a determination to call a hard cutoff at 50
# due to the distribution of the story score; however, we should
# note in the final report that stories can have a score up to 150,
# just that this is rare. It might be worth investigating if this changed
# over time (i.e. scoring algo)

# clip() caps values above 50 and leaves missing scores as NaN; the earlier
# element-wise lambda turned NaN into 50 because `NaN <= 50` is False.
stories['score'] = stories['score'].clip(upper=50)

In [None]:
stories['score'].hist(bins=100)

In [None]:
stories.score.value_counts()[:10]

### Investigating other potential targets

In [None]:
stories['upvotes'].hist(bins=100)

In [None]:
stories['comment_count'].hist(bins=100)

### Your Turn

- Add at least one more feature to your dataset.
- It can be using any of the object columns, or scaling a numeric column
- Feel free to add *more* than one if you have some ideas!

### Saving numeric dataset

In [None]:
# Keep only the columns that have a numeric dtype.
stories_numeric = stories.select_dtypes([np.number])

In [None]:
stories_numeric.shape

In [None]:
# Replace every remaining missing value in the numeric frame with 0.
# NOTE(review): this treats "missing" as zero for all columns alike — confirm
# that is appropriate for each feature.
stories_numeric = stories_numeric.fillna(0)

In [None]:
# Write the numeric frame (index included) to CSV, relative to the
# notebook's working directory.
stories_numeric.to_csv('../data/lobsters_numeric.csv')

In [None]:
!head ../data/lobsters_numeric.csv

In [None]:
# Write the full frame, non-numeric columns included, to CSV as well.
stories.to_csv('../data/lobsters_full_2017_cleaned.csv')