** Experiment 4 **

* Exploratory Data Analysis of the problem
* What is the nature of the data ?
* How is the data laid out ?
* Feature Generation Ideas

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import scipy as sp
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from natsort import natsorted

sns.set_style('dark')
pd.set_option('max_columns', None)

SEED = 131341
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/HotstarDataset.py
%run ../src/features/categorical_features.py

In [8]:
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')

dataset.load_json()\
       .encode_target()\
       .concat_data()\
       .save_data('../data/processed/hotstar_processed.feather')

<__main__.Hotstar at 0x7f35f3eed780>

In [9]:
data       = dataset.data
train_mask = dataset.get_train_mask() 

In [2]:
dataset = Hotstar('../data/raw/5f828822-4--4-hotstar_dataset/')
dataset.load_data('../data/processed/hotstar_processed.feather')

<__main__.Hotstar at 0x7f5905657dd8>

In [3]:
data       = dataset.data
train_mask = dataset.get_train_mask()

** Similarities between train and test set **

In [36]:
# cities involved in training set
cities_train = set([])
cities_test  = set([])

def add_to_train_set(cities):
    for city in cities.split(','):
        cities_train.add(city)

def add_to_test_set(cities):
    for city in cities.split(','):
        cities_test.add(city)

data.loc[train_mask, 'cities'].str.replace('[:](\d+)', '')\
    .map(add_to_train_set)
    
data.loc[~train_mask, 'cities'].str.replace('[:](\d+)', '')\
    .map(add_to_test_set);

In [37]:
print('Cities that are in training set but not in test set \n', cities_train - cities_test)

Cities that are in training set but not in test set 
 {'kokrajhar', 'bien hoa', 'tomelloso', 'ballerup', 'higashi', 'campulung', 'saskatoon', 'cancun', 'sehore', 'windhoek', 'satara', 'montfermeil', 'tumkur', 'kuala dungun', 'kuppam', 'poissy', 'silkeborg', 'weehawken', 'gent', 'tauranga', 'norwich', 'lake forest', 'vishakhapatnam', 'jind', 'albuquerque', 'phuket', 'tardeo', 'kota pinang', 'yojna vihar', 'lubumbashi', 'baton rouge', 'sunnyvale', 'vientiane', 'cachan', 'kuala pilah', 'small heath', 'university of new south wales', 'betul', 'ongole', 'waltham', 'grafton', 'sunshine', 'quanzhou', 'vilvoorde', 'timisoara', 'mahwa', 'aubonne', 'bedok', 'pali', 'semarang', 'ilford', 'mount gambier', 'logrono', 'fort-de-france', 'sungai petani', 'alexandria', 'racine', 'stapleford', "lower prince's quarter", 'exeter', 'west bromwich', 'yiwu', 'hahira', 'massy', 'jagadhri', 'rzeszow', 'bari', 'guayaquil', 'norrkoping', 'vittal nagar', 'thornhill', 'clichy', 'kuala belait', 'hampton', 'albury',

In [38]:
print('Cities that are in test set but not in training set \n', cities_test - cities_train)

Cities that are in test set but not in training set 
 {'baltimore', 'asker', 'pitesti', 'merrimack', 'beaverton', "qiu'aizhen", 'frankfurt-innenstadt', 'monterrey', 'baki', 'aabenraa', 'sarcelles', 'cagliari', 'merrifield', 'tehri', 'yuzhno-sakhalinsk', 'rewari', 'alor setar', 'jawahar nagar', 'veenendaal', 'matawan', 'frankfurt-griesheim', 'fuzhou', 'llantwit fardre', 'jalgaon', 'paradiso', 'pithoragarh', 'wageningen', 'tranbjerg', 'regina', 'nanaimo', 'pozuelo de alarcon', 'colpalombo', 'phaltan', 'zhengzhou', 'toluca de lerdo', 'saint peter port', 'pantin', 'roxas', 'pleasanton', 'bridgetown', 'irvine', 'krabi', 'midrand', 'quebec city', 'kosice', 'fort huachuca', 'joda', 'tampa', 'gaborone', 'de hoef', 'taiping', 'chelyabinsk', 'guetersloh', '`ajman', 'bloherfelde', 'oakville', 'rostov-na-donu', 'porbandar', 'bath', 'barnaul', 'bridgeton', 'carshalton', 'perris', 'cuxhaven', 'cuddalore', 'perugia', 'angamaly', 'cirebon', 'xiamen', 'panathur', 'belize city', 'son en breugel', 'dhang

In [39]:
# cities involved in training set
genres_train = set([])
genres_test  = set([])

def add_to_train_set(genres):
    for genre in genres.split(','):
        genres_train.add(genre)

def add_to_test_set(genres):
    for genre in genres.split(','):
        genres_test.add(genre)

data.loc[train_mask, 'genres'].str.replace('[:](\d+)', '')\
    .map(add_to_train_set)
    
data.loc[~train_mask, 'genres'].str.replace('[:](\d+)', '')\
    .map(add_to_test_set);

In [44]:
print('Genres that are in training but not in test ', genres_train - genres_test)

Genres that are in training but not in test  set()


In [45]:
print('Genres that are in the test set but not in training set ', genres_test - genres_train)

Genres that are in the test set but not in training set  set()


In [25]:
def get_cities_ohe(cities):
    return cities.str.replace('[:](\d+)', '').str.get_dummies(sep=',')

def get_titles_ohe(titles):
    return titles.str.replace('[:](\d+)', '').str.get_dummies(sep=',')

def get_genres_ohe(genres):
    return genres.str.replace('[:](\d+)', '').str.get_dummies(sep=',')

In [4]:
# cities
city_names  = data.cities.str.replace('[:](\d+)', '')\
                 .map(lambda x: ','.join(natsorted(x.split(','))))

# genres
genre_names = data.genres.str.replace('[:](\d+)', '')\
                  .map(lambda x: ','.join(natsorted(x.split(','))))

data = data.assign(city_names=city_names)
data = data.assign(genre_names=genre_names)

In [None]:
new_train, new_test = woe(data.loc[train_mask, ['city_names', 'genre_names']],
                          data.loc[~train_mask, ['city_names', 'genre_names']],
                          data.loc[train_mask, 'segment'],
                          features=['city_names', 'genre_names']
                         )

In [5]:
data

Unnamed: 0,ID,cities,dow,genres,segment,titles,tod,city_names,genre_names
0,train-1,"gurgaon:55494,delhi:31892","1:3412,3:15878,2:1737,5:10975,4:20974,7:17820,...","Cricket:82379,Kabaddi:255,Reality:4751",0.0,"Top Raids: Haryana vs Services SCB:103,Day 4: ...","10:26,13:331,12:323,20:21864,21:16233,17:7953,...","delhi,gurgaon","Cricket,Kabaddi,Reality"
1,train-10,"delhi:5862,nagar:8916,mumbai:1593","1:5745,3:3025,2:3346,5:123,4:3007,7:1108,6:10","Cricket:15640,Wildlife:730",0.0,"Dhoni Quits Captaincy:148,Day 4: India Move in...","11:1661,10:384,20:401,21:798,22:221,16:525,19:...","delhi,mumbai,nagar","Cricket,Wildlife"
2,train-100,navi mumbai:4142,3:4142,"LiveTV:13,Football:4129",0.0,"Star Sports 4:13,Manchester United vs Everton:...","1:1207,0:2406,2:529",navi mumbai,"Football,LiveTV"
3,train-1000,"new delhi:4131,chennai:2878,navi mumbai:1339","1:658,3:5867,5:413,4:1339,7:71","TalkShow:658,Cricket:7690",0.0,"SRH vs RCB:701,KKR vs KXIP:1042,MI vs SRH:2288...","11:71,20:2417,21:1042,23:2288,19:1872,8:658","chennai,navi mumbai,new delhi","Cricket,TalkShow"
4,train-10000,"gurgaon:6077,chennai:4055","1:1641,2:480,4:1445,7:1663,6:4900","Drama:5503,Cricket:3283,Reality:1345",0.0,"MI vs KKR:304,Yeh Rishta Kya Kehlata Hai:5449,...","20:158,22:4139,17:67,23:1510,19:288,18:56,0:23...","chennai,gurgaon","Cricket,Drama,Reality"
5,train-100000,"hyderabad:998,bangalore:2748,gulbarga:43317,be...","1:6707,3:1948,2:3574,5:8525,4:18938,7:8295,6:7344","Action:998,Drama:8795,Cricket:45541",0.0,"India vs Australia 2nd Test English:2836,SRH v...","11:3450,10:1243,13:4420,12:4210,20:7050,21:770...","bangalore,bengaluru,gulbarga,hyderabad","Action,Cricket,Drama"
6,train-100001,navi mumbai:10155,"1:1575,3:5330,2:1242,4:2007","Action:963,TalkShow:18,Romance:1357,Mythology:...",0.0,"Jodi:7222,Maapillai:1357,Mahabharatham:594,Ban...","11:20,12:574,21:1357,22:1066,23:2290,0:4847",navi mumbai,"Action,Mythology,Reality,Romance,TalkShow"
7,train-100002,"delhi:1571,navi mumbai:12729","1:333,2:2233,5:739,4:268,7:10727","Drama:4344,Cricket:9956",0.0,"Chandra Nandni:4344,India vs England 2nd T20I ...","11:242,15:419,14:1877,22:309,19:3063,18:64,1:2...","delhi,navi mumbai","Cricket,Drama"
8,train-100003,delhi:1318,"2:34,5:1074,7:210","Cricket:1248,Comedy:70",0.0,"India vs Bangladesh Day 2 English:1066,Fielder...","10:844,20:65,17:69,23:78,19:40,9:222",delhi,"Comedy,Cricket"
9,train-100004,"chandigarh:2214,delhi:3829,mumbai:9465","5:14,4:14292,7:1201","Action:86,Drama:4826,Cricket:10557,Kids:24,Tal...",0.0,"The Jungle Book:24,Jolly LLB:4826,Escape Plan:...","11:1,13:1610,12:2626,20:2325,21:2985,17:1252,1...","chandigarh,delhi,mumbai","Action,Cricket,Drama,Kids,TalkShow"
