## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

## Load Dataset

In [34]:
# Load the training set; unlike the test file it contains the "#likes" column.
training_data = pd.read_csv("data/train.csv")

In [52]:
# Load the test set (per its file name, shipped without the ground-truth column).
testing_data = pd.read_csv("data/test_without_truth.csv")

## Analysing Training Data

In [37]:
# Summary statistics of the numeric training columns.
training_data.describe()

Unnamed: 0,post_id,user_id,country,#views,#comments,#likes
count,369920.0,369920.0,369920.0,369885.0,359309.0,369920.0
mean,264102.305696,21056.832826,6.700938,500111.4,2003.481,23105.87
std,152590.609962,12101.006391,4.424512,1623025.0,12098.24,86168.47
min,0.0,0.0,0.0,6113.0,0.0,1.0
25%,131912.75,10552.0,3.0,61366.0,211.0,2641.0
50%,264087.0,21224.0,6.0,157165.0,573.0,7012.0
75%,396315.25,31537.0,11.0,423921.0,1523.0,19285.0
max,528460.0,41773.0,14.0,177987200.0,1219455.0,6197312.0


In [38]:
# Dtypes and non-null counts per column; reveals which columns have gaps.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369920 entries, 0 to 369919
Data columns (total 7 columns):
post_id      369920 non-null int64
user_id      369920 non-null int64
country      369920 non-null int64
category     369920 non-null object
#views       369885 non-null float64
#comments    359309 non-null float64
#likes       369920 non-null int64
dtypes: float64(2), int64(4), object(1)
memory usage: 19.8+ MB


In [39]:
# Peek at the first five training rows.
training_data.head()

Unnamed: 0,post_id,user_id,country,category,#views,#comments,#likes
0,141569,13588,13,gaming,2403572.0,5606.0,107865
1,278047,5601,13,gaming,1421234.0,7316.0,68570
2,278280,31823,13,gaming,1460690.0,9924.0,136786
3,161305,14424,13,gaming,1463710.0,1087.0,62590
4,340945,8532,13,gaming,2507066.0,2974.0,21325


In [40]:
# Count the missing values in every training column.
training_data.isnull().sum()

post_id          0
user_id          0
country          0
category         0
#views          35
#comments    10611
#likes           0
dtype: int64

In [41]:
# Report how many distinct values each identifier / categorical column holds.
for column_name in ("post_id", "user_id", "country", "category"):
    distinct_values = training_data[column_name].unique()
    print(column_name, len(distinct_values))

post_id 369920
user_id 38395
country 15
category 9


In [42]:
# Dtypes and non-null counts per column of the test set.
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158541 entries, 0 to 158540
Data columns (total 6 columns):
post_id      158541 non-null int64
user_id      158541 non-null int64
country      158541 non-null int64
category     158541 non-null object
#views       158527 non-null float64
#comments    153990 non-null float64
dtypes: float64(2), int64(3), object(1)
memory usage: 7.3+ MB


In [43]:
# Count the missing values in every test column.
testing_data.isna().sum()

post_id         0
user_id         0
country         0
category        0
#views         14
#comments    4551
dtype: int64

## Handling Missing Values

Missing values are present in
1. Views column
    - Set #views equal to #comments of the same row
2. Comments column
    - Take the average number of comments of the same user_id (0 if the user has no known average)

### Views Column

In [50]:
def handle_views_col(df):
    """Fill missing "#views" values with the "#comments" value of the same row.

    The frame is split into rows without any missing value and rows with at
    least one. Only the second part is patched, then both parts are stacked
    again with the complete rows first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the "#views" and "#comments" columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df``. The previous row labels are kept in an extra
        "index" column (added by ``reset_index``), and the new row labels
        restart at 0 for each of the two parts. A row whose "#comments" is
        missing too keeps a missing "#views".
    """
    # reset_index() returns fresh frames, so the assignment below never writes
    # into a slice of the caller's frame.
    complete_rows = df.dropna(axis=0, how="any").reset_index()
    incomplete_rows = df[df.isna().any(axis=1)].reset_index()

    # Column-wise replacement for the former per-row `.at` loop.
    incomplete_rows["#views"] = incomplete_rows["#views"].fillna(
        incomplete_rows["#comments"]
    )
    return pd.concat([complete_rows, incomplete_rows])

In [58]:
# Impute missing "#views" in the training set with the row's "#comments" value.
# The rows are split into a complete and an incomplete part; reset_index() keeps
# the old row labels in an "index" column and returns fresh frames, so nothing
# is written into a slice of training_data.
df1 = training_data.dropna(axis=0, how="any").reset_index()
df2 = training_data[training_data.isna().any(axis=1)].reset_index()
# Column-wise replacement for the former per-row loop; rows whose "#comments"
# is missing as well keep a missing "#views".
df2["#views"] = df2["#views"].fillna(df2["#comments"])
df3_view_train = pd.concat([df1, df2])

In [60]:
# Check the non-null counts after the "#views" imputation.
df3_view_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369920 entries, 0 to 10645
Data columns (total 8 columns):
index        369920 non-null int64
post_id      369920 non-null int64
user_id      369920 non-null int64
country      369920 non-null int64
category     369920 non-null object
#views       369920 non-null float64
#comments    359309 non-null float64
#likes       369920 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 25.4+ MB


In [55]:
# Impute missing "#views" in the test set with the row's "#comments" value.
# The rows are split into a complete and an incomplete part; reset_index() keeps
# the old row labels in an "index" column and returns fresh frames, so nothing
# is written into a slice of testing_data.
df1 = testing_data.dropna(axis=0, how="any").reset_index()
df2 = testing_data[testing_data.isna().any(axis=1)].reset_index()
# Column-wise replacement for the former per-row loop; rows whose "#comments"
# is missing as well keep a missing "#views".
df2["#views"] = df2["#views"].fillna(df2["#comments"])
df3_view_test = pd.concat([df1, df2])

In [57]:
# Check the non-null counts after the "#views" imputation.
df3_view_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158541 entries, 0 to 4564
Data columns (total 7 columns):
index        158541 non-null int64
post_id      158541 non-null int64
user_id      158541 non-null int64
country      158541 non-null int64
category     158541 non-null object
#views       158541 non-null float64
#comments    153990 non-null float64
dtypes: float64(2), int64(4), object(1)
memory usage: 9.7+ MB


### Comments Column - Training Set

In [61]:
# Keep only the dataset columns; this discards the helper "index" column.
train_columns = ["post_id", "user_id", "country", "category", "#views", "#comments", "#likes"]
training_data = df3_view_train[train_columns]

In [62]:
# df1: rows without any missing value; df2: rows with at least one.
# Both get a fresh 0..n-1 row index, the old labels move to an "index" column.
has_missing = training_data.isna().any(axis=1)
df1 = training_data.dropna(axis=0, how="any").reset_index()
df2 = training_data[has_missing].reset_index()

In [63]:
# Mean "#comments" per user, computed from the rows without missing values.
# A single groupby replaces one full scan of df1 per user. Results can differ
# from the former sum()/len() in the last floating-point digits only.
user_list = list(df1.user_id.unique())
user_avg_data = df1.groupby("user_id")["#comments"].mean().to_dict()

In [64]:
# Impute missing "#comments" with the mean comment count of the same user;
# users without a known mean get 0. Only missing cells are written, so a
# present "#comments" value in df2 is never overwritten.
user_mean_comments = df2["user_id"].map(user_avg_data)
df2["#comments"] = df2["#comments"].fillna(user_mean_comments).fillna(0)

In [66]:
# Stack the complete rows and the imputed rows again, then inspect the
# non-null counts.
train_parts = [df1, df2]
df3_col_train = pd.concat(train_parts)
df3_col_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 369920 entries, 0 to 10610
Data columns (total 8 columns):
index        369920 non-null int64
post_id      369920 non-null int64
user_id      369920 non-null int64
country      369920 non-null int64
category     369920 non-null object
#views       369920 non-null float64
#comments    369920 non-null float64
#likes       369920 non-null int64
dtypes: float64(2), int64(5), object(1)
memory usage: 25.4+ MB


In [67]:
# Drop the helper "index" column and count the remaining missing values.
final_train_columns = ["post_id", "user_id", "country", "category", "#views", "#comments", "#likes"]
training_data = df3_col_train[final_train_columns]
training_data.isna().sum()

post_id      0
user_id      0
country      0
category     0
#views       0
#comments    0
#likes       0
dtype: int64

### Comments Column - Testing Set

In [68]:
# Keep only the dataset columns; this discards the helper "index" column.
test_columns = ["post_id", "user_id", "country", "category", "#views", "#comments"]
testing_data = df3_view_test[test_columns]

In [70]:
# df1: rows without any missing value; df2: rows with at least one.
# Both get a fresh 0..n-1 row index, the old labels move to an "index" column.
has_missing = testing_data.isna().any(axis=1)
df1 = testing_data.dropna(axis=0, how="any").reset_index()
df2 = testing_data[has_missing].reset_index()

In [71]:
# Impute missing "#comments" in the test rows with the per-user mean stored in
# user_avg_data; users without a known mean get 0. Only missing cells are
# written, so a present "#comments" value in df2 is never overwritten.
user_mean_comments = df2["user_id"].map(user_avg_data)
df2["#comments"] = df2["#comments"].fillna(user_mean_comments).fillna(0)

In [72]:
# Stack the complete rows and the imputed rows again, then inspect the
# non-null counts.
test_parts = [df1, df2]
df3_col_test = pd.concat(test_parts)
df3_col_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158541 entries, 0 to 4550
Data columns (total 7 columns):
index        158541 non-null int64
post_id      158541 non-null int64
user_id      158541 non-null int64
country      158541 non-null int64
category     158541 non-null object
#views       158541 non-null float64
#comments    158541 non-null float64
dtypes: float64(2), int64(4), object(1)
memory usage: 9.7+ MB


In [73]:
# Drop the helper "index" column from the imputed test set.
final_test_columns = ["post_id", "user_id", "country", "category", "#views", "#comments"]
testing_data = df3_col_test[final_test_columns]

In [74]:
# Count the remaining missing values per test column.
testing_data.isna().sum()

post_id      0
user_id      0
country      0
category     0
#views       0
#comments    0
dtype: int64

In [75]:
# Persist the imputed training set. The row index is written as an unnamed
# first column, which read_csv later loads as "Unnamed: 0".
training_data.to_csv("data/train_null_free.csv")

In [76]:
# Persist the imputed test set. The row index is written as an unnamed first
# column, which read_csv later loads as "Unnamed: 0".
testing_data.to_csv("data/test_null_free.csv")

## Data Normalization

### Normalization - Training

In [2]:
# Reload the imputed training set. reset_index() adds an "index" column, which
# is dropped again when the feature file is exported.
training_data = pd.read_csv("data/train_null_free.csv").reset_index()

In [3]:
# Training-set extrema used for min-max scaling of both train and test data.
views = training_data['#views'].to_numpy()
comments = training_data['#comments'].to_numpy()
likes = training_data['#likes'].to_numpy()

# ndarray.max()/min() run in compiled code and propagate NaN deterministically,
# unlike the Python builtins applied element by element to an array.
max_views, min_views = views.max(), views.min()
max_comments, min_comments = comments.max(), comments.min()
max_likes, min_likes = likes.max(), likes.min()

In [4]:
def _min_max_scale(value, lower, upper):
    """Return (value - lower) / (upper - lower) as a float."""
    numerator = float(value - lower)
    denominator = float(upper - lower)
    return numerator / denominator


def normalise_views(num):
    """Min-max scale a "#views" value with the module-level min_views / max_views."""
    return _min_max_scale(num, min_views, max_views)


def normalise_comments(num):
    """Min-max scale a "#comments" value with min_comments / max_comments."""
    return _min_max_scale(num, min_comments, max_comments)


def normalise_likes(num):
    """Min-max scale a "#likes" value with min_likes / max_likes."""
    return _min_max_scale(num, min_likes, max_likes)

In [5]:
# Categories in order of first appearance in the training set, and the 15
# country codes 0..14.
category_list = list(training_data.category.unique())
country_list = list(range(15))

# Position of every category inside its one-hot vector.
category_dict = {name: position for position, name in enumerate(category_list)}

def get_category_val(cat):
    """Return the one-hot vector (integer numpy array) of category ``cat``.

    The vector has one slot per entry of ``category_list``; the slot
    ``category_dict[cat]`` is 1, all others are 0. A category that is not a
    key of ``category_dict`` raises KeyError.
    """
    encoding = np.zeros(len(category_list), dtype=int)
    encoding[category_dict[cat]] = 1
    return encoding

def get_country_enc(country):
    """Return the one-hot vector (integer numpy array) of a country code.

    Parameters
    ----------
    country : int or float
        Country code; it is converted with ``int()`` and used as the position
        of the 1 inside a vector of ``len(country_list)`` slots.

    Raises
    ------
    IndexError
        If the code is negative or not smaller than ``len(country_list)``.
        Negative codes are rejected explicitly because numpy would otherwise
        count them from the end and mark the wrong country.
    """
    position = int(country)
    if position < 0:
        raise IndexError("country code %d is negative" % position)
    one_hot = np.zeros(len(country_list), dtype=int)
    one_hot[position] = 1
    return one_hot

In [25]:
# Add the min-max scaled columns and the one-hot indicator columns to the
# training set. Column-wise operations replace the former loop that issued 27
# scalar `.at` writes per row; column names, order and float dtype are the same.
training_data["norm_views"] = training_data["#views"].map(normalise_views)
training_data["norm_comments"] = training_data["#comments"].map(normalise_comments)
training_data["norm_likes"] = training_data["#likes"].map(normalise_likes)

# cat1..catN: one indicator column per entry of category_list.
category_codes = training_data["category"].map(category_dict)
if category_codes.isna().any():
    raise KeyError("training_data contains a category missing from category_dict")
for category_position in range(len(category_list)):
    training_data["cat{}".format(category_position + 1)] = (
        category_codes == category_position
    ).astype(float)

# count1..countN: one indicator column per country code 0..len(country_list)-1.
country_codes = training_data["country"].astype(int)
if ((country_codes < 0) | (country_codes >= len(country_list))).any():
    raise IndexError("training_data contains a country code outside country_list")
for country_position in range(len(country_list)):
    training_data["count{}".format(country_position + 1)] = (
        country_codes == country_position
    ).astype(float)

In [7]:
# Persist the normalised training set; the row index is written as an extra
# unnamed first column.
training_data.to_csv("data/train_data_normalise.csv")

### Normalisation - Testing

In [8]:
# Reload the imputed test set. reset_index() adds an "index" column, which is
# dropped again when the feature file is exported.
testing_data = pd.read_csv("data/test_null_free.csv").reset_index()

In [26]:
# Add the min-max scaled columns and the one-hot indicator columns to the test
# set, using the extrema and category order taken from the training set.
# Column-wise operations replace the former loop that issued 26 scalar `.at`
# writes per row; column names, order and float dtype are the same.
testing_data["norm_views"] = testing_data["#views"].map(normalise_views)
testing_data["norm_comments"] = testing_data["#comments"].map(normalise_comments)

# cat1..catN: one indicator column per entry of category_list.
category_codes = testing_data["category"].map(category_dict)
if category_codes.isna().any():
    raise KeyError("testing_data contains a category missing from category_dict")
for category_position in range(len(category_list)):
    testing_data["cat{}".format(category_position + 1)] = (
        category_codes == category_position
    ).astype(float)

# count1..countN: one indicator column per country code 0..len(country_list)-1.
country_codes = testing_data["country"].astype(int)
if ((country_codes < 0) | (country_codes >= len(country_list))).any():
    raise IndexError("testing_data contains a country code outside country_list")
for country_position in range(len(country_list)):
    testing_data["count{}".format(country_position + 1)] = (
        country_codes == country_position
    ).astype(float)

In [10]:
# Persist the normalised test set; the row index is written as an extra
# unnamed first column.
testing_data.to_csv("data/test_data_normalise.csv")

## Feature Engineering

- No of average views in each category
- No of average comments in each category
- No of average views in each country
- No of average comments in each country
- No of views on a post
- No of comments on a post
- No of average views of the user
- No of average comments of the user

### Training Data - Features

In [11]:
# Reload the normalised training set. The saved index comes back as
# "Unnamed: 0" next to the older "index" and "Unnamed: 0.1" columns.
training_data = pd.read_csv("data/train_data_normalise.csv")

In [12]:
def _mean_or_zero(values):
    """Return the arithmetic mean of a list, or 0 for an empty list."""
    return sum(values) / len(values) if len(values) > 0 else 0


def _profile_averages(frame, column, keys):
    """Map every key to [mean norm_views, mean norm_comments] of its rows.

    A row belongs to a key when ``frame[column]`` equals that key.
    """
    averages = {}
    for key in keys:
        rows = frame[frame[column] == key]
        averages[key] = [
            _mean_or_zero(list(rows['norm_views'].to_numpy())),
            _mean_or_zero(list(rows['norm_comments'].to_numpy())),
        ]
    return averages


# Average normalised views / comments per category and per country.
category_avg_data = _profile_averages(training_data, 'category', category_list)
country_avg_data = _profile_averages(training_data, 'country', country_list)

In [13]:
# Average normalised views / comments per user: user_id -> [views, comments].
# A single groupby replaces one full scan of training_data per user. Results
# can differ from the former sum()/len() in the last floating-point digits only.
user_list = list(training_data.user_id.unique())
user_means = training_data.groupby("user_id")[["norm_views", "norm_comments"]].mean()
user_avg_data = {
    user: [avg_views, avg_comments]
    for user, avg_views, avg_comments in zip(
        user_means.index, user_means["norm_views"], user_means["norm_comments"]
    )
}

In [27]:
# Attach the user / country / category profile averages to every training row.
# Column-wise `map` replaces the former per-row `.at` loop; a key missing from
# a profile dict still raises KeyError.
for key_column, profile, prefix in (
    ("user_id", user_avg_data, "user"),
    ("country", country_avg_data, "country"),
    ("category", category_avg_data, "category"),
):
    keys = training_data[key_column]
    # Every profile entry is [average views, average comments].
    training_data[prefix + "_avg_views"] = keys.map(lambda key: profile[key][0])
    training_data[prefix + "_avg_comments"] = keys.map(lambda key: profile[key][1])

In [20]:
# Remove the index artefacts accumulated by the CSV round trips, then export
# the training features.
index_artefacts = ["Unnamed: 0", "index", "Unnamed: 0.1"]
df = training_data.drop(index_artefacts, axis=1)
df.to_csv("data/training_data_with_features.csv")

### Testing Data - Features

In [21]:
# Reload the normalised test set. The saved index comes back as "Unnamed: 0"
# next to the older "index" and "Unnamed: 0.1" columns.
testing_data = pd.read_csv("data/test_data_normalise.csv")

In [22]:
# Renumber the rows without keeping the old labels. The frame already has an
# "index" column, so a plain reset_index() would add a "level_0" column that is
# never dropped and would end up in the exported test features; it would also
# raise if the cell were run twice.
testing_data = testing_data.reset_index(drop=True)

In [28]:
# Attach the training-set profile averages to every test row. Column-wise
# operations replace the former per-row `.at` loop.
user_ids = testing_data["user_id"]
known_user = user_ids.isin(list(user_avg_data))

# Users absent from user_avg_data have no profile: fall back to the row's own
# normalised views / comments.
for position, (feature, fallback) in enumerate(
    (("user_avg_views", "norm_views"), ("user_avg_comments", "norm_comments"))
):
    profile_values = user_ids.map(
        lambda uid: user_avg_data[uid][position] if uid in user_avg_data else float("nan")
    )
    testing_data[feature] = profile_values.where(known_user, testing_data[fallback])

# Country and category profiles: a key missing from the dict raises KeyError.
for key_column, profile, prefix in (
    ("country", country_avg_data, "country"),
    ("category", category_avg_data, "category"),
):
    keys = testing_data[key_column]
    # Every profile entry is [average views, average comments].
    testing_data[prefix + "_avg_views"] = keys.map(lambda key: profile[key][0])
    testing_data[prefix + "_avg_comments"] = keys.map(lambda key: profile[key][1])

In [24]:
# Remove the index artefacts accumulated by the CSV round trips, then export
# the test features.
index_artefacts = ["Unnamed: 0", "index", "Unnamed: 0.1"]
df = testing_data.drop(index_artefacts, axis=1)
df.to_csv("data/testing_data_with_features.csv")

## Features Description

<b>#1 : Norm Views</b>
<br> 
min-max normalization of #views
<br> 
<b>#2 : Norm Comments</b>
<br> min-max normalisation of #comments
<br> 
<b>#3 : user_avg_views</b>
<br> Using the concept of user profiling, these are avg #views which a user got
<br> 
<b>#4 : user_avg_comments</b>
<br> Using the concept of user profiling, these are avg #comments which a user got
<br> 
<b>#5 : country_avg_views</b>
<br> Using the concept of country profiling, these are avg #views which are seen by users of a country
<br> 
<b>#6 : country_avg_comments</b>
<br> Using the concept of country profiling, these are avg #comments which are given by users of a country
<br> 
<b>#7 : category_avg_views</b>
<br> Using the concept of category profiling, this is the average normalised #views of the training posts in a particular category
<br> 
<b>#8 : category_avg_comments</b>
<br> Using the concept of category profiling, this is the average normalised #comments of the training posts in a particular category
<br> 
<b>#9 - #17 : Cat1 - Cat9</b>
<br> One-hot encoding of category
<br> 
<b>#18 - #32 : Count1 - Count15</b>
<br> One-hot encoding of a country