In [1]:
import pandas as pd
import pandas_profiling as pp
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [4]:
# Load the competition CSVs: `data` is the training frame, `data_test` the held-out test frame.
# Paths are relative to the notebook's working directory.
data = pd.read_csv('data/predict-the-number-of-upvotes-a-post-will-get/train.csv')
data_test = pd.read_csv('data/predict-the-number-of-upvotes-a-post-will-get/test.csv')

In [5]:
# Peek at the first rows of the training frame to see the columns and sample values.
data.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,ID,Reputation,Answers,Username,Views,Upvotes
count,330045.0,330045.0,330045.0,330045.0,330045.0,330045.0
mean,235748.682789,7773.147,3.917672,81442.888803,29645.07,337.505358
std,136039.418471,27061.41,3.579515,49215.10073,80956.46,3592.441135
min,1.0,0.0,0.0,0.0,9.0,0.0
25%,117909.0,282.0,2.0,39808.0,2594.0,8.0
50%,235699.0,1236.0,3.0,79010.0,8954.0,28.0
75%,353620.0,5118.0,5.0,122559.0,26870.0,107.0
max,471493.0,1042428.0,76.0,175738.0,5231058.0,615278.0


In [None]:
# Build a pandas-profiling report for the training frame.
# NOTE(review): presumably slow on the full frame (describe() shows ~330k rows) — consider profiling a sample.
pp.ProfileReport(data)

### Data Analysis

- The number of features is small.
- There is one categorical feature: Tag.
- The ID column may not be needed, as it merely acts as an index.
- The numerical data is not normalized.
- No null values are present.
- Username can repeat, since the same user can have multiple posts; either drop it or convert it into something usable before using it.
- Upvotes is highly skewed; maybe we should bucketize it.
- Tag has 10 unique values.
- Upvotes is highly dependent on Views, Reputation and Answers.

### Data Visualization

In [None]:
## Upvotes by Tag: how does the upvote count vary across tag categories?
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Categorical plot of Upvotes for each Tag value; the trailing `;` suppresses the object repr.
sns.catplot(x="Tag", y="Upvotes", data=data);

In [None]:
# Histogram of Upvotes (50 bins, no KDE curve) with a rug mark per observation.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in favour of
# `histplot`/`displot` (plus `rugplot`) — confirm the installed seaborn version before migrating.
# NOTE(review): `rug=True` draws one tick per row, which is presumably slow on the full frame.
sns.distplot(data.Upvotes, bins=50, kde=False, rug=True);


### Data Manipulation

- Shuffle the data.
- Remove the ID and Username features for the first cut.
- Convert the categorical feature Tag into integer codes (label encoding).
- Normalize the data.

- Can bucketize a few features, e.g. Upvotes.

### Predict Upvote - Regression Phase 1 

In [None]:
# Shuffle the rows with a fixed seed so that Restart & Run All yields the same row order
# (and therefore the same train/validation split and metrics) on every run.
# The seed matches the one used for train_test_split further down.
data = shuffle(data, random_state=42)

In [None]:
# Show the available columns, then pick the model inputs and the target.
print('Coloumn Names ', data.columns)
# Numeric input columns; the categorical `Tag` column is handled separately in preprocess_data.
features = ['Reputation', 'Answers', 'Views']
# Target column, kept as a list so that data[label].values is a 2-D (n, 1) array.
label = ['Upvotes']

In [None]:
def preprocess_data(data_set, features, labels, scaler=None, binarizer=None, is_test_data=False):
    """Build a scaled feature matrix (numeric features + encoded ``Tag``) and a label array.

    In training mode (``is_test_data=False``) a fresh ``LabelEncoder`` is fitted on
    ``data_set.Tag`` and a fresh ``StandardScaler`` is fitted on the assembled matrix;
    any ``scaler``/``binarizer`` passed in are ignored. In test mode the already-fitted
    ``scaler`` and ``binarizer`` are applied without refitting.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame containing the ``features`` columns and a ``Tag`` column.
    features : list of str
        Names of the numeric feature columns.
    labels : str or list of str
        Name(s) of the target column(s).
    scaler : fitted scaler with a ``transform`` method, optional
        Required when ``is_test_data`` is True.
    binarizer : fitted encoder with a ``transform`` method, optional
        Required when ``is_test_data`` is True.
    is_test_data : bool, default False
        When True, reuse the supplied transformers instead of fitting new ones.

    Returns
    -------
    X : numpy.ndarray
        Scaled matrix with one column per feature followed by the encoded ``Tag`` column.
    y : numpy.ndarray or None
        ``data_set[labels].values``; ``None`` in test mode when the label column(s)
        are absent from ``data_set``.
    scaler, binarizer
        The transformers that were used (freshly fitted in training mode).

    Raises
    ------
    ValueError
        If ``is_test_data`` is True and ``scaler`` or ``binarizer`` is missing.
    """
    if is_test_data:
        if scaler is None or binarizer is None:
            raise ValueError(
                "scaler and binarizer fitted on the training data are required "
                "when is_test_data=True"
            )
    else:
        binarizer = LabelEncoder()
        scaler = StandardScaler()
        binarizer.fit(data_set.Tag)

    # The encoder returns a 1-D array of codes; it must be reshaped into a column
    # before it can be concatenated with the 2-D feature matrix along axis=1.
    tag_codes = np.asarray(binarizer.transform(data_set.Tag)).reshape(-1, 1)
    X = np.concatenate((data_set[features].values, tag_codes), axis=1)

    # Unlabelled test frames have no target column; return None instead of failing.
    label_columns = [labels] if isinstance(labels, str) else list(labels)
    if is_test_data and not set(label_columns).issubset(data_set.columns):
        y = None
    else:
        y = data_set[labels].values

    if not is_test_data:
        scaler.fit(X)
    X = scaler.transform(X)
    return X, y, scaler, binarizer

In [None]:
# Training mode (is_test_data defaults to False): fit the Tag encoder and the scaler on `data`,
# and keep both so they can be reused on other frames.
# NOTE(review): the scaler is fitted on all rows before the train/validation split below,
# so validation statistics influence the scaling — confirm this is acceptable.
X, y, scaler, binarizer = preprocess_data(data_set=data, features=features, labels= label)

In [None]:
# Sanity check: X and y should have the same number of rows.
print('Shape of X', X.shape)
print('Shape of y', y.shape)

In [None]:
 # Hold out 20% of the rows for validation; random_state fixes the split for reproducibility.
 # NOTE(review): this cell carries a stray leading space — presumably tolerated by IPython,
 # but it would be an IndentationError if the code were exported to a plain .py script.
 X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Sanity check on the split: the train arrays and the validation arrays should each agree on row count.
print('Shape of X train', X_train.shape)
print('Shape of y train', y_train.shape)

print('Shape of X val', X_val.shape)
print('Shape of y val', y_val.shape)

In [None]:
## Normal Equation Implementation
# NOTE(review): the original implementation was missing from this cell even though the next
# cell reads `y_pred`, so the notebook failed on a fresh kernel. This is a reconstruction of
# closed-form least squares; confirm it reproduces the metrics reported below.

# Prepend a column of ones so that theta[0] acts as the intercept term.
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]

# Least-squares solution of X_train_b @ theta = y_train. This is the normal-equation
# solution theta = (X^T X)^{-1} X^T y, computed without forming an explicit inverse.
theta, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)

# Predictions on the validation split, same shape as y_val.
y_pred = X_val_b @ theta


In [99]:
# Score the validation predictions. sklearn metrics take (y_true, y_pred) in that order;
# MAE and MSE are symmetric, so the values are unchanged, but the conventional order
# keeps this cell correct if an asymmetric metric (e.g. r2_score) is added later.
print('MAE: ', mean_absolute_error(y_val, y_pred))
print('MSE: ', mean_squared_error(y_val, y_pred))


MAE:  149.24299609170063
MSE:  1836724.9780551065
