In [None]:
#Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import seaborn as sns
import math
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, hstack
import timeit

In [None]:
# Load the train and test sets from CSV files in the working directory,
# then preview the first rows of the training frame.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
train_df.head()

In [None]:
# Snapshot copies of both frames, taken before the Tag column is
# label-encoded in place further down the notebook.
Train_df, Test_df = train_df.copy(), test_df.copy()

### Data Analysis

#### ID

In [None]:
# Check whether any ID occurs in both the train and the test set.
# Set intersection is symmetric, so both prints show the same members.
train_ids, test_ids = set(train_df['ID']), set(test_df['ID'])
print(train_ids & test_ids)
print(test_ids & train_ids)

###### IDs are not repeated, so they are useless as a feature. They are required only for assigning predictions back to rows.

In [None]:
# Scatter of Upvotes against ID on a wide canvas
plt.figure(figsize=(15, 6))
plt.scatter(x=train_df['ID'], y=train_df['Upvotes'])

In [None]:
# Row count versus number of distinct IDs in the training frame
id_column = train_df['ID']
print(len(id_column))
print(len(id_column.unique()))
# Observation: IDs are not repeated; every ID is unique

#### Tag

In [None]:
# Relative frequency of every Tag, ordered by tag label: train first, then test
for frame in (train_df, test_df):
    print(frame['Tag'].value_counts(normalize=True).sort_index())

###### Tags are Equally distributed so they are highly necessary

In [None]:
# Indices of categorical features; 1 is recorded here
# (presumably the position of the Tag column -- TODO confirm against its consumer).
cat_features = [1]

# Label-encode Tag: learn the classes from the training set only, then apply
# the same mapping to both frames (both frames are overwritten in place).
lb_enc = LabelEncoder().fit(train_df['Tag'])
train_df['Tag'] = lb_enc.transform(train_df['Tag'])
test_df['Tag'] = lb_enc.transform(test_df['Tag'])

In [None]:
# Distinct encoded Tag values present in the training frame
tag_codes = train_df['Tag'].unique()
tag_codes

#### Reputation

In [None]:
# Reputation vs. Upvotes for rows with fewer than one lakh (100,000) upvotes.
# The row filter is evaluated once and reused for both axes.
below_lakh = train_df[train_df['Upvotes'] < 100000]
plt.figure(figsize=(15, 10))
plt.scatter(below_lakh['Reputation'], below_lakh['Upvotes'])
plt.tick_params(colors='white')

In [None]:
# Reputation vs. Upvotes for rows with at least one lakh (100,000) upvotes.
# The row filter is evaluated once and reused for both axes.
lakh_or_more = train_df[train_df['Upvotes'] >= 100000]
plt.figure(figsize=(15, 10))
plt.scatter(lakh_or_more['Reputation'], lakh_or_more['Upvotes'])
plt.tick_params(colors='white')

In [None]:
# Distribution of Reputation restricted to values below 20000, using 50 bins.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot when the environment is upgraded.
low_reputation = train_df.loc[train_df['Reputation'] < 20000, 'Reputation']
plt.figure(figsize=(15, 10))
sns.distplot(low_reputation, bins=50)
plt.tick_params(colors='white')
plt.show()

In [None]:
# The raw Reputation curve looked exponential, so the natural log is plotted
# instead; this form should be helpful for linear models or neural networks.
# Rows with Reputation <= 0 are excluded before taking the log.
positive_reputation = train_df.loc[train_df['Reputation'] > 0, 'Reputation']
plt.figure(figsize=(15, 10))
sns.distplot(np.log(positive_reputation), bins=50)
plt.tick_params(colors='white')
plt.show()

###### The reputation of the question author has an impact on upvotes, and it is also related to the username.

#### Answers - try as numeric

In [None]:
# Relative frequency of each Answers value: train first, then test
for frame in (train_df, test_df):
    print(frame['Answers'].value_counts(normalize=True))

In [None]:
# Bar plot of Upvotes grouped by Answers, with vertical x tick labels
plt.figure(figsize=(20, 10))
sns.barplot(data=train_df, x='Answers', y='Upvotes')
plt.tick_params(colors='white')
plt.xticks(rotation=90)

In [None]:
# Row count versus number of distinct Answers values
answers_column = train_df['Answers']
print(len(answers_column))
print(len(answers_column.unique()))

#### Username - must be categorical

In [None]:
# Relative frequency of each Username: train first, then test
for frame in (train_df, test_df):
    print(frame['Username'].value_counts(normalize=True))

In [None]:
# Scatter of Upvotes against Username, with vertical x tick labels
plt.figure(figsize=(20, 10))
plt.scatter(x=train_df['Username'], y=train_df['Upvotes'])
plt.tick_params(colors='white')
plt.xticks(rotation=90)

In [None]:
# Row count versus number of distinct Username values
username_column = train_df['Username']
print(len(username_column))
print(len(username_column.unique()))

#### Views

In [None]:
# Relative frequency of each Views value: train first, then test
for frame in (train_df, test_df):
    print(frame['Views'].value_counts(normalize=True))

In [None]:
# Views vs. Upvotes for rows with fewer than 2,000,000 views.
# The row filter is evaluated once and reused for both axes.
under_two_million_views = train_df[train_df['Views'] < 2000000]
plt.figure(figsize=(20, 10))
plt.scatter(under_two_million_views['Views'], under_two_million_views['Upvotes'])
plt.tick_params(colors='white')
plt.xticks(rotation=90)

In [None]:
# Row count versus number of distinct Views values
views_column = train_df['Views']
print(len(views_column))
print(len(views_column.unique()))

###### Keep Views as a numeric feature

### Model

###### Variables
1. ID -> Don't use
2. Tag -> Category
3. Reputation -> Numeric
4. Answers -> Numeric
5. Username -> Category
6. Views -> Numeric


In [None]:
# Preview the training frame again; Tag now holds label-encoded values
train_df.head(n=5)

In [None]:
# One-hot encode Username. OneHotEncoder is already imported in the notebook's
# import cell, so it is not re-imported here.
# drop='first' omits the indicator column of the first category.
# The double brackets pass a one-column DataFrame, as the encoder expects 2-D input.
one_enc = OneHotEncoder(drop='first')
encoded_username = one_enc.fit_transform(train_df[['Username']])
encoded_username

In [None]:
# Show the container type returned by the encoder (relevant for the sparse
# stacking done in the next cell).
print(type(encoded_username))

In [None]:
# Build the feature matrix by joining two sparse matrices column-wise.
# Every column except ID, Username and the Upvotes target is taken as a dense
# array and converted to COO format by coo_matrix(), so that it can be stacked
# next to the one-hot Username matrix.
remaining_columns = train_df.drop(columns=['ID', 'Username', 'Upvotes'])
remaining_sparse = coo_matrix(remaining_columns.values)
train_df_encoded_sparse = hstack([remaining_sparse, encoded_username])
train_df_encoded_sparse

In [None]:
#XGBoost regressor with default settings, scored by 4-fold cross-validation
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

regressor = XGBRegressor()
scores = cross_val_score(
    estimator=regressor,
    X=train_df_encoded_sparse,
    y=train_df['Upvotes'],
    scoring='neg_mean_squared_error',
    cv=4,
)
print(scores)
# The scorer returns negated MSE values, so the sign of the fold mean is
# flipped before taking the square root.
rmse = np.sqrt(-scores.mean())
print(rmse)

In [None]:
# Print selected hyper-parameters of the regressor, one per line,
# followed by its full attribute dictionary.
inspected_attributes = (
    'n_estimators',
    'verbosity',
    'learning_rate',
    'tree_method',
    'reg_alpha',
    'reg_lambda',
)
for attribute_name in inspected_attributes:
    print(getattr(regressor, attribute_name))
print(vars(regressor))

### Grid Search for n_estimators

In [None]:
# Inputs for the grid search: the Upvotes column as target and an independent
# copy of the stacked sparse feature matrix as features.
y_train = train_df['Upvotes']
X_train = train_df_encoded_sparse.copy()

In [None]:
# Grid search over n_estimators with 5-fold cross-validation.
# GridSearchCV is already imported in the notebook's import cell, so it is not
# re-imported here.
param_grid = {'n_estimators': [300, 500, 700, 900]}
grid_cv_1 = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')

# Wall-clock timing of the full search
t1 = timeit.default_timer()
grid_cv_1.fit(X_train, y_train)
print("Time taken : ", timeit.default_timer() - t1)

# best_score_ is a negated MSE (scoring='neg_mean_squared_error'), so values
# closer to zero are better.
print("Score : ", grid_cv_1.best_score_)
print("Params : ", grid_cv_1.best_params_)
print("Results : ", grid_cv_1.cv_results_)