## Import required libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib as plt

## Load Dataset

In [64]:
# Read the labelled training set and the unlabelled test set from the local
# data/ directory (paths are relative to the notebook's working directory).
training_data = pd.read_csv("data/train.csv")
testing_data = pd.read_csv("data/test_without_truth.csv")

## Analysing Training Data

In [65]:
# Summarise the raw training frame: column names, non-null counts and dtypes.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369920 entries, 0 to 369919
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   post_id    369920 non-null  int64  
 1   user_id    369920 non-null  int64  
 2   country    369920 non-null  int64  
 3   category   369920 non-null  object 
 4   #views     369885 non-null  float64
 5   #comments  359309 non-null  float64
 6   #likes     369920 non-null  int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 19.8+ MB


In [66]:
# Preview the first rows of the raw training frame.
training_data.head()

Unnamed: 0,post_id,user_id,country,category,#views,#comments,#likes
0,141569,13588,13,gaming,2403572.0,5606.0,107865
1,278047,5601,13,gaming,1421234.0,7316.0,68570
2,278280,31823,13,gaming,1460690.0,9924.0,136786
3,161305,14424,13,gaming,1463710.0,1087.0,62590
4,340945,8532,13,gaming,2507066.0,2974.0,21325


In [67]:
# Count the missing entries in every column.
training_data.isna().sum()

post_id          0
user_id          0
country          0
category         0
#views          35
#comments    10611
#likes           0
dtype: int64

In [68]:
# Report how many distinct values each identifier / categorical column holds.
for column_name in ("post_id", "user_id", "country", "category"):
    print(column_name, len(training_data[column_name].unique()))

post_id 369920
user_id 38395
country 15
category 9


## Handling Missing Values

Missing values are present in
1. Views column
    - Option 1: Drop these rows
    - Option 2: No of views must be greater than or equal to no of comments
    - Option 3: We can take average of no of views based on user_id, category and country 
2. Comments column
    - Option 1: Drop these rows
    - Option 2: Take average of no of comments based on user_id, category and country

In [69]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 (the old index is dropped rather than kept as a column).
training_data = training_data.dropna(axis=0, how="any").reset_index(drop=True)

In [70]:
# Keep only the leading rows of the cleaned frame for the steps that follow.
# NOTE: label-based .loc slicing is inclusive at both ends, so this keeps the
# rows labelled 0..5000, i.e. 5001 rows (not 5000).
# .copy() makes the subset an independent frame, so the column assignments in
# later cells write to this frame itself rather than to a slice of the full
# data (avoids SettingWithCopyWarning and ambiguous view/copy behaviour).
training_data = training_data.loc[:5000].copy()

In [71]:
# Sanity check: per-column missing-value counts, followed by the row count.
print(training_data.isna().sum())
print(len(training_data))

post_id      0
user_id      0
country      0
category     0
#views       0
#comments    0
#likes       0
dtype: int64
5001


In [72]:
# Preview the first rows of the reduced, null-free training frame.
training_data.head()

Unnamed: 0,post_id,user_id,country,category,#views,#comments,#likes
0,141569,13588,13,gaming,2403572.0,5606.0,107865
1,278047,5601,13,gaming,1421234.0,7316.0,68570
2,278280,31823,13,gaming,1460690.0,9924.0,136786
3,161305,14424,13,gaming,1463710.0,1087.0,62590
4,340945,8532,13,gaming,2507066.0,2974.0,21325


In [73]:
# Re-check row count, non-null counts and dtypes after dropping nulls and subsetting.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   post_id    5001 non-null   int64  
 1   user_id    5001 non-null   int64  
 2   country    5001 non-null   int64  
 3   category   5001 non-null   object 
 4   #views     5001 non-null   float64
 5   #comments  5001 non-null   float64
 6   #likes     5001 non-null   int64  
dtypes: float64(2), int64(4), object(1)
memory usage: 273.6+ KB


## Data Normalization

In [74]:
# Pull the three numeric columns out as NumPy arrays.
views = training_data['#views'].to_numpy()
comments = training_data['#comments'].to_numpy()
likes = training_data['#likes'].to_numpy()

# Per-column extrema used by the normalise_* functions for min-max scaling.
# The array reductions run in compiled code; the builtin max()/min() would
# iterate the arrays element by element in Python.
max_views, min_views = views.max(), views.min()
max_comments, min_comments = comments.max(), comments.min()
max_likes, min_likes = likes.max(), likes.min()

In [75]:
def min_max_scale(num, lower, upper):
    """Min-max scale ``num`` so that ``lower`` maps to 0.0 and ``upper`` to 1.0.

    Parameters
    ----------
    num : number
        Value to scale.
    lower, upper : number
        Minimum and maximum of the reference range.

    Returns
    -------
    float
        ``(num - lower) / (upper - lower)``, or 0.0 when the range is empty
        (``upper == lower``), which would otherwise raise ZeroDivisionError.
    """
    span = float(upper - lower)
    if span == 0.0:
        # Constant column: every value equals the minimum.
        return 0.0
    return float(num - lower) / span


def normalise_views(num):
    """Scale a raw view count using the global ``min_views`` / ``max_views``."""
    return min_max_scale(num, min_views, max_views)


def normalise_comments(num):
    """Scale a raw comment count using the global ``min_comments`` / ``max_comments``."""
    return min_max_scale(num, min_comments, max_comments)


def normalise_likes(num):
    """Scale a raw like count using the global ``min_likes`` / ``max_likes``."""
    return min_max_scale(num, min_likes, max_likes)

In [76]:
# Distinct category labels, in the order they first appear in the frame.
category_list = training_data.category.unique().tolist()

# Country codes are hard-coded as 0..14 instead of being derived from the data
# with training_data.country.unique(), so every code gets a slot even if it
# does not occur in the current frame.
country_list = list(range(15))

# Map each category label to its position in category_list (its one-hot slot).
category_dict = {label: position for position, label in enumerate(category_list)}

def get_category_val(cat):
    """Return the one-hot integer vector for category label ``cat``.

    The vector has one slot per entry of the global ``category_list``; the slot
    looked up in the global ``category_dict`` is set to 1. An unknown label
    raises KeyError.
    """
    encoded = np.zeros(len(category_list), dtype=int)
    encoded[category_dict[cat]] = 1
    return encoded

def get_country_enc(country):
    """Return the one-hot integer vector for a numeric country code.

    The vector has one slot per entry of the global ``country_list``; the code
    itself (converted with ``int``) is used directly as the slot index.
    """
    slot = int(country)
    encoded = np.zeros(len(country_list), dtype=int)
    encoded[slot] = 1
    return encoded

In [77]:
# Build the normalised and one-hot feature columns column-wise instead of
# writing one cell at a time: the per-row `.iloc[i]` / `.at[i, ...]` pattern is
# very slow and only works when the index labels happen to equal row positions.

# Min-max normalised copies of the three numeric columns.
training_data["norm_views"] = training_data["#views"].map(normalise_views)
training_data["norm_comments"] = training_data["#comments"].map(normalise_comments)
training_data["norm_likes"] = training_data["#likes"].map(normalise_likes)

n_rows = training_data.shape[0]

# One-hot category columns cat1..catN, one per entry of category_list.
# Stored as floats, matching the dtype the cell-by-cell version produced.
# The explicit reshape keeps the matrix 2-D even when the frame is empty.
category_matrix = np.array(
    [get_category_val(label) for label in training_data["category"]], dtype=float
).reshape(n_rows, len(category_list))
for position in range(len(category_list)):
    training_data["cat" + str(position + 1)] = category_matrix[:, position]

# One-hot country columns count1..countM, one per entry of country_list.
country_matrix = np.array(
    [get_country_enc(code) for code in training_data["country"]], dtype=float
).reshape(n_rows, len(country_list))
for position in range(len(country_list)):
    training_data["count" + str(position + 1)] = country_matrix[:, position]
    

## Features Generation

- No of average views in each category
- No of average comments in each category
- No of average views in each country
- No of average comments in each country
- No of views
- No of comments
- No of average views of the user
- No of average comments of the user

In [78]:
def average_by_key(frame, key_column, keys):
    """Average the normalised views/comments of ``frame`` per value of ``key_column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``key_column``, ``norm_views`` and ``norm_comments``.
    key_column : str
        Column whose values define the groups.
    keys : iterable
        Group values to report; a value with no rows in ``frame`` gets ``[0, 0]``.

    Returns
    -------
    dict
        ``{key: [avg_norm_views, avg_norm_comments]}`` for every entry of ``keys``.
    """
    # A single groupby pass replaces re-filtering the whole frame once per key.
    means = frame.groupby(key_column)[["norm_views", "norm_comments"]].mean()
    views_by_key = means["norm_views"].to_dict()
    comments_by_key = means["norm_comments"].to_dict()
    return {
        key: [views_by_key.get(key, 0), comments_by_key.get(key, 0)]
        for key in keys
    }


user_list = list(training_data.user_id.unique())

# Average normalised views/comments per category, per country and per user.
category_avg_data = average_by_key(training_data, "category", category_list)
country_avg_data = average_by_key(training_data, "country", country_list)
user_avg_data = average_by_key(training_data, "user_id", user_list)

In [80]:
# Attach the group-level averages to every row with a vectorised lookup per
# column, instead of per-cell `.at` writes driven by positional `.iloc` reads.
# Each lookup table maps key -> [avg_views, avg_comments]; a key missing from
# its table raises KeyError, as the row-wise version did.
for prefix, key_column, lookup in (
    ("user", "user_id", user_avg_data),
    ("country", "country", country_avg_data),
    ("category", "category", category_avg_data),
):
    keys = training_data[key_column]
    # `table=lookup` binds the current table at definition time.
    training_data[prefix + "_avg_views"] = keys.map(lambda key, table=lookup: table[key][0])
    training_data[prefix + "_avg_comments"] = keys.map(lambda key, table=lookup: table[key][1])

In [81]:
# Inspect the frame again to confirm the normalised, one-hot and average
# feature columns were added and contain no nulls.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   post_id                5001 non-null   int64  
 1   user_id                5001 non-null   int64  
 2   country                5001 non-null   int64  
 3   category               5001 non-null   object 
 4   #views                 5001 non-null   float64
 5   #comments              5001 non-null   float64
 6   #likes                 5001 non-null   int64  
 7   norm_views             5001 non-null   float64
 8   norm_comments          5001 non-null   float64
 9   norm_likes             5001 non-null   float64
 10  cat1                   5001 non-null   float64
 11  cat2                   5001 non-null   float64
 12  cat3                   5001 non-null   float64
 13  cat4                   5001 non-null   float64
 14  cat5                   5001 non-null   float64
 15  cat6

In [82]:
# Persist the processed frame. With the default index=True the row index is
# written as an extra first column with an empty header.
training_data.to_csv("data/processed_data.csv")