#### Import necessary Libraries

In [None]:
import requests
import pandas as pd
import json
from random import randint
import numpy as np
import pickle
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


#### Load Data

In [None]:
# Read the phone dataset from the CSV file 'phone_df.csv' (relative path)
phone_df = pd.read_csv('phone_df.csv')
# Preview the first rows of the loaded frame
phone_df.head()


In [None]:
# Distinct values present in the screen_size column
phone_df['screen_size'].unique()

In [None]:
# Print a per-column summary of the raw frame
phone_df.info()

In [None]:
# Summary statistics for the columns of the raw frame
phone_df.describe()

#### Exploratory Data Analysis

visualization to assess the relationship of independent features with the target variable Price

In [None]:
# Brand vs. price: translucent markers (opacity 0.25), coloured by brand
fig = px.scatter(
    phone_df,
    x='brand',
    y='price',
    color='brand',
    opacity=0.25,
    hover_data=['brand', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (BRAND - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()


visualization to assess the relationship of screen size with the target variable Price

In [None]:
# Screen size vs. price: translucent markers, coloured by screen_size
fig = px.scatter(
    phone_df,
    x='screen_size',
    y='price',
    color='screen_size',
    opacity=0.25,
    hover_data=['screen_size', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (SCREEN SIZE - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()

visualization to assess the relationship of ram with the target variable Price

In [None]:
# RAM vs. price: translucent markers, coloured by ram
fig = px.scatter(
    phone_df,
    x='ram',
    y='price',
    color='ram',
    opacity=0.25,
    hover_data=['ram', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (RAM - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()

visualization to assess the relationship of rom with the target variable Price

In [None]:
# ROM vs. price: translucent markers, coloured by rom
fig = px.scatter(
    phone_df,
    x='rom',
    y='price',
    color='rom',
    opacity=0.25,
    hover_data=['rom', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (ROM - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()

visualization to assess the relationship of mp with the target variable Price

In [None]:
# Megapixels (mp) vs. price: translucent markers, coloured by mp
fig = px.scatter(
    phone_df,
    x='mp',
    y='price',
    color='mp',
    opacity=0.25,
    hover_data=['mp', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (MEGA PIXELS - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()

visualization to assess the relationship of battery with the target variable Price

In [None]:
# Battery vs. price: translucent markers, coloured by battery
fig = px.scatter(
    phone_df,
    x='battery',
    y='price',
    color='battery',
    opacity=0.25,
    hover_data=['battery', 'price'],
    template='plotly_dark',
    title='PHONE PRICE PREDICTION (BATTERY - PRICE RELATIONSHIP)',
)
fig.update_layout(height=600, width=1000)
fig.show()

#### Data Preprocessing

Distribution of the Target Variable

In [None]:
# Distribution of the raw price values
fig = px.histogram(
    phone_df,
    x=["price"],
    title='Histogram of Price',
    template='plotly_dark',
)
fig.update_layout(height=600, width=1000)
fig.show()

Log Transform the Target (price)

In [None]:
# Work on a copy so phone_df itself is left unchanged
data_copy = phone_df.copy()

# Natural log of price, stored as a new Log_Price column.
# NOTE(review): np.log gives -inf/NaN for non-positive inputs — assumes every
# price is > 0; confirm against the dataset.
data_copy['Log_Price'] = np.log(data_copy['price'])

# Distribution of the log-transformed target (title typo "Histgram" fixed)
fig = px.histogram(data_copy, x=["Log_Price"], title='Histogram of Log Price', template='plotly_dark')
fig.update_layout(width=1000, height=600)
fig.show()

Remove Outliers from the Target Column

In [None]:
# Box plot of Log_Price, used to eyeball outliers in the target
fig = px.box(
    data_copy,
    y='Log_Price',
)
fig.update_layout(height=600, width=800)
fig.show()

In [None]:
# Threshold for upper outliers: the 0.931 quantile of Log_Price
upper_limit = data_copy['Log_Price'].quantile(0.931)

# Rows whose log price lies above that threshold
above_limit = data_copy['Log_Price'] > upper_limit
upper_outliers = data_copy[above_limit]

# Keep only the rows whose index label is not among the outliers
keep_rows = ~data_copy.index.isin(upper_outliers.index)
no_outlier = data_copy[keep_rows]

# Box plot of Log_Price after the upper tail has been removed
fig = px.box(
    no_outlier,
    y='Log_Price',
)
fig.update_layout(height=600, width=800)
fig.show()

In [None]:
# Outlier-free frame without the raw price column; Log_Price stays as the
# target. (The earlier `data = no_outlier` assignment was a dead store that was
# overwritten on the very next line, so it has been removed.)
data = no_outlier.drop(columns=['price'])
data.head()

In [None]:
# Per-column summary of the frame after the target outliers were removed
data.info()

visualization to assess outliers

In [None]:
# One figure per numeric feature: a box plot next to a histogram
for var in ('screen_size', 'ram', 'rom', 'mp', 'battery'):
    plt.figure(figsize=(14, 10))

    # Left half of the figure: box plot of the feature
    plt.subplot(1, 2, 1)
    fig = data.boxplot(column=var)
    fig.set_ylabel(var)

    # NOTE(review): (2, 2, 2) selects the top-right cell of a 2x2 grid, while
    # the box plot above uses a 1x2 grid — confirm (1, 2, 2) was not intended.
    plt.subplot(2, 2, 2)
    fig = data[var].hist()
    fig.set_xlabel(var)

    plt.show()
    

In [None]:
# screen_size: values outside the 0.1–0.89 quantile band count as outliers
lower_limit = data['screen_size'].quantile(0.1)
upper_limit = data['screen_size'].quantile(0.89)

# Rows below / above the band
lower_outliers = data[data['screen_size'] < lower_limit]
upper_outliers = data[data['screen_size'] > upper_limit]

# A row is excluded when its index label appears in either outlier set
is_outlier = data.index.isin(upper_outliers.index) | data.index.isin(lower_outliers.index)
dataa = data[~is_outlier]


# battery: values outside the 0.2–0.80 quantile band count as outliers
lower_limit = data['battery'].quantile(0.2)
upper_limit = data['battery'].quantile(0.80)

# Rows below / above the band
lower_outliers = data[data['battery'] < lower_limit]
upper_outliers = data[data['battery'] > upper_limit]

# A row is excluded when its index label appears in either outlier set
is_outlier = data.index.isin(upper_outliers.index) | data.index.isin(lower_outliers.index)
dataaa = data[~is_outlier]

# Box plots of the trimmed columns, drawn side by side in a single figure
plt.figure(figsize=(14, 10))

# screen_size, taken from the screen_size-trimmed frame
plt.subplot(1, 2, 1)
fig = dataa.boxplot(column='screen_size')
fig.set_ylabel('screen_size')

# battery, taken from the battery-trimmed frame
plt.subplot(2, 2, 2)
fig = dataaa.boxplot(column='battery')
fig.set_ylabel('battery')

plt.show()

In [None]:
# Log_Price becomes the modelling target under the name 'price'
data = data.rename(columns={'Log_Price': 'price'})

# Overwrite both features with their trimmed versions. Assignment aligns on the
# index, so rows missing from the trimmed frames receive NaN here.
data['battery'] = dataaa['battery']
data['screen_size'] = dataa['screen_size']

data.head()

In [None]:
# Per-column summary after the trimmed columns were written back
data.info()

In [None]:
# Replace missing values with the column mean. The means are computed over the
# numeric columns only: a plain data.mean() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently skipped them).
# NOTE(review): assumes the frame still holds a non-numeric column such as
# 'brand' — confirm against the dataset.
numeric_means = data.select_dtypes(include='number').mean()
data.fillna(numeric_means, inplace=True)
data.info()


In [None]:
# Box plots of screen_size and battery from the imputed frame, in one figure
plt.figure(figsize=(14, 10))

plt.subplot(1, 2, 1)
fig = data.boxplot(column='screen_size')
fig.set_ylabel('screen_size')

plt.subplot(2, 2, 2)
fig = data.boxplot(column='battery')
fig.set_ylabel('battery')

plt.show()

Visualize Correlation of the feature columns

In [None]:
# Pairwise scatter-plot matrix of the columns in data (each panel 2 units
# high, aspect ratio 1)
sns.pairplot(data,  height=2, aspect=1)

# Render the figure
plt.show()

In [None]:
# Heatmap of absolute pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them in pandas >= 2.0.
plt.figure(figsize=(10, 8))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr().abs(), annot=True)

#### Fitting models

In [None]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

# Feature matrix and target. 'price' holds the log-transformed price at this
# point.
# NOTE(review): 'battery' is dropped from the features even though it was
# cleaned above — confirm this is intentional.
X = data.drop(['price', 'battery'], axis=1)
y = data['price']

# Object columns with fewer than 10 distinct values are one-hot encoded.
# An object column with 10 or more distinct values lands in neither list and
# is therefore dropped by the ColumnTransformer below.
categorical_cols = [col_names for col_names in X.columns if X[col_names].nunique() < 10 and X[col_names].dtype == 'object']
numerical_cols = [col_names for col_names in X.columns if X[col_names].dtype in ['int64', 'float64']]

# Processing of numerical data: standardise each column
numerical_transformer = StandardScaler()

# Processing of categorical data: one-hot encode. handle_unknown='ignore'
# encodes a category that was absent from the fitted fold as all zeros instead
# of raising, which would otherwise surface only as NaN cross-validation
# scores because warnings are suppressed in this notebook.
categorical_transformer = OneHotEncoder(categories='auto', handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Candidate regressors; random_state is fixed where the estimator accepts it
regular_reg = ElasticNetCV()
dt_reg = DecisionTreeRegressor(random_state=1)
bag_reg = BaggingRegressor(random_state=1)
ab_reg = AdaBoostRegressor(random_state=1)
kn_reg = KNeighborsRegressor()
ridge = Ridge(random_state=1)
l_reg = LinearRegression()
gb_reg = GradientBoostingRegressor(random_state=1)

# Display name -> estimator, consumed by score_performance
models = {
    'ElasticNetCV': regular_reg,
    'DecisionTreeRegressor': dt_reg,
    'BaggingRegressor': bag_reg,
    'AdaBoostRegressor': ab_reg,
    'KNeighborsRegressor': kn_reg,
    'Ridge': ridge,
    'LinearRegression': l_reg,
    'GradientBoostingRegressor': gb_reg,
}

# Compare candidate models by the mean and spread of their cross-validation scores
def score_performance(models, X, y, process='Training', cv=5):
    """Cross-validate every model behind the shared preprocessor and print the results.

    Each estimator is wrapped in a Pipeline together with the module-level
    ``preprocessor`` and scored with ``cross_val_score``, which uses the
    estimator's default scorer.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted estimator.
    X, y :
        Features and target handed to ``cross_val_score``.
    process : str, default 'Training'
        Label printed once before the per-model results.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``. The previous version built an unused
        ``KFold(n_splits=10)`` while hard-coding ``cv=5``; the default keeps
        the behaviour that actually ran.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print(f'Process; {process}')
    for model_name, model in models.items():
        # Preprocessing is part of the pipeline so it is refit on each fold
        my_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        scores = cross_val_score(my_pipeline, X, y, cv=cv)
        print(f'Model: {model_name}')
        print(f'Cross validation mean score: {round(np.mean(scores), 4)}')
        print(f'Cross validation std: {round(np.std(scores), 4)}')
        print('\n')

# Hold out 20% of the rows as a test set (test_size=0.2); random_state=42
# fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validate every candidate model on the training portion only
score_performance(models, X_train, y_train, process='Training')

In [None]:
# Seed NumPy's global RNG before the search is run
np.random.seed(42)
gb_reg = GradientBoostingRegressor(random_state=1)

# Hyperparameter candidates. max_features=None (use all features) replaces the
# former "auto" option, which meant the same thing for this estimator but was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises.
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": [None, "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

rscv = RandomizedSearchCV(estimator=gb_reg,
                          param_distributions=grid,
                          n_iter=20,  # sample 20 parameter combinations
                          cv=5)       # 5-fold cross-validation per combination

# The search is the final pipeline step, so the preprocessor is fitted once on
# all of X_train before the search splits it into folds.
my_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', rscv)
        ])

# Fit the preprocessing and the RandomizedSearchCV version of gb_reg
my_pipeline.fit(X_train, y_train)

In [None]:
# Evaluation helper: root mean squared log error (RMSLE)
def rmsle(y_test, y_preds):
    """
    Return the square root of the mean squared log error between the true
    values (y_test) and the predictions (y_preds).

    NOTE(review): in this notebook the target is already a log price, so the
    metric applies a second log — confirm that is intended.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)
    
# Score a fitted model on the training and held-out splits with several metrics
def show_scores(model):
    """Return a dict of MAE, RMSLE and R^2 for the training and test splits.

    Reads the module-level X_train, X_test, y_train and y_test. Keys are
    "<split> <metric>" with split in {"Training", "Valid"}.
    """
    # (label, true values, predictions); training predictions are made first
    splits = (
        ("Training", y_train, model.predict(X_train)),
        ("Valid", y_test, model.predict(X_test)),
    )
    metrics = (
        ("MAE", mean_absolute_error),
        ("RMSLE", rmsle),
        ("R^2", r2_score),
    )
    return {
        f"{split} {label}": metric(truth, preds)
        for label, metric in metrics
        for split, truth, preds in splits
    }

# Metrics for the pipeline whose final step is the fitted randomized search
show_scores(my_pipeline)

In [None]:
# Best hyperparameter combination found by the RandomizedSearchCV run
rscv.best_params_

In [None]:
# Gradient boosting regressor with explicitly written hyperparameters.
# NOTE(review): the values are literals rather than read from
# rscv.best_params_ — confirm they match the search result.
ideal_model = GradientBoostingRegressor(
    random_state=42,
    n_estimators=100,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
)

# Same preprocessing as before, followed by the regressor above
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ideal_model),
])

# Train on the training split
my_pipeline.fit(X_train, y_train)

In [None]:
# Metrics for the most recently fitted pipeline on both splits
show_scores(my_pipeline)

In [None]:
# Default score of the pipeline's final estimator on the held-out test split
my_pipeline.score(X_test, y_test)

#### Save the model

In [None]:
# Save the fitted pipeline to disk with pickle.
# The context manager closes the file handle once the dump finishes; the
# previous bare open(...) left it to the garbage collector.
with open("phone_price_predictor.pkl", "wb") as model_file:
    pickle.dump(my_pipeline, model_file)