In [1]:
import pandas as pd
import numpy as np
import pickle

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

from sklearn.preprocessing import MultiLabelBinarizer


init_notebook_mode(connected=True)

In [2]:
file_name = "iphone_df_all.csv"
iphone_df = pd.read_csv(file_name)
iphone_df = iphone_df.drop_duplicates()

In [3]:
iphone_df.columns

Index(['Phone Model', 'Carrier', 'Condition', 'Color', 'Size',
       'Recently Sold Price', 'Date'],
      dtype='object')

Scraped data on base price from wikipedia, then manually created a dict (turns out it doesn't help much)

In [4]:
base_price_new = {'apple-iphone-6s':649, 'apple-iphone-6s-plus':749, 'apple-iphone-7-a1660':649,
       'apple-iphone-7-plus-a1661':769, 'apple-iphone-8-a1863':699,
       'apple-iphone-8-plus-a1864':799, 'apple-iphone-se':399, 'apple-iphone-x-a1865':999}

In [5]:
iphone_df['Base Price When New'] = iphone_df['Phone Model'].map(base_price_new);

In [6]:
def generate_traces(dataframe,col):
    unique_elem = dataframe[col].unique()
    traces = []
    for uni in unique_elem:
        traces.append((list(iphone_df[iphone_df[col] == uni]['Recently Sold Price']),uni))
    return traces

In [7]:
iphone_df.columns

Index(['Phone Model', 'Carrier', 'Condition', 'Color', 'Size',
       'Recently Sold Price', 'Date', 'Base Price When New'],
      dtype='object')

In [8]:
iphone_df['Color'] = iphone_df['Color'].replace('Gray', 'Grey')

In [9]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

#Check price distributions
values_dist = iphone_df['Recently Sold Price']
values_dist = sorted(list(values_dist))

trace1 = go.Scatter(
        x=list(range(0,len(values_dist))),
        y=values_dist,
        mode='markers',
        ) 
layout = {
    
    'title':'Sorted iPhone Prices',
    'xaxis' : dict(title = 'Array Index'),
    'yaxis' : dict(title = 'Sold Price'),
    'showlegend': False
}
data = [trace1]

iplot({'data':data, 'layout':layout},filename='basic')

There is a gap in data, between $400-500, not highly represented. Gives rise to a bimodal looking residual plot later in the visualizations.

## Phone Model Visualization

In [10]:
models = iphone_df['Phone Model'].value_counts()
list(models)

[76, 64, 55, 51, 41, 37, 12, 11]

In [11]:
data_models = [go.Bar(
            x=list(models.index),
            y=list(models)
    )]

iplot({'data':data_models, 'layout':{'title': 'Phone Model Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [12]:
data_box_model = []
traces_models = generate_traces(iphone_df,'Phone Model')

for trace in traces_models:
    data_box_model.append(go.Box( y=trace[0],name='{}'.format(trace[1])))
    
se = data_box_model.pop(1)
data_box_model.append(se)

    
iplot({'data':data_box_model, 'layout':{'title': 'Model Size Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

## Phone Color Visualization

In [13]:
colors = iphone_df['Color'].value_counts()
list(colors)

[162, 62, 59, 42, 14, 8]

In [14]:
data_color = [go.Bar(
            x=list(colors.index),
            y=list(colors)
    )]

iplot({'data':data_color, 'layout':{'title': 'Color Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [15]:
colors = iphone_df['Color'].unique()

In [16]:
data_box_color = []
traces_colors = generate_traces(iphone_df,'Color')

for trace in traces_colors:
    data_box_color.append(go.Box( y=trace[0],name='{}'.format(trace[1])))
    
iplot({'data':data_box_color, 'layout':{'title': 'Color Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

## Phone Memory Size Visualization

In [17]:
size = iphone_df['Size'].value_counts()
size_str = [str(i)+'GB' for i in size.index]
print(size)

64     164
256     52
16      48
32      45
128     38
Name: Size, dtype: int64


In [18]:
data_size = [go.Bar(
            x=size_str,
            y=list(size)
    )]

iplot({'data':data_size, 'layout':{'title': 'Memory Size Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [19]:
data_box_size = []
traces_sizes = generate_traces(iphone_df,'Size')

for trace in traces_sizes:
    data_box_size.append(go.Box( y=trace[0],name='{}GB'.format(trace[1])))
gb_128 = data_box_size.pop()
data_box_size.insert(1,gb_128)
iplot({'data':data_box_size, 'layout':{'title': 'Memory Size Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

## Phone Condition Visualization

In [20]:
condition = iphone_df['Condition'].value_counts()
print(condition)

Good            194
Mint            105
Fair             25
New (Resale)     23
Name: Condition, dtype: int64


In [21]:
data_condition = [go.Bar(
            x=list(condition.index),
            y=list(condition)
    )]

iplot({'data':data_condition, 'layout':{'title': 'Phone Condition Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [22]:
data_box_condition = []
traces_condition = generate_traces(iphone_df,'Condition')

for trace in traces_condition:
    data_box_condition.append(go.Box( y=trace[0],name='{}'.format(trace[1])))
    
iplot({'data':data_box_condition, 'layout':{'title': 'Phone Condition Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

## Phone Carrier Visualization

In [23]:
carrier = iphone_df['Carrier'].value_counts()
print(condition)

Good            194
Mint            105
Fair             25
New (Resale)     23
Name: Condition, dtype: int64


In [24]:
data_carrier = [go.Bar(
            x=list(carrier.index),
            y=list(carrier)
    )]

iplot({'data':data_carrier, 'layout':{'title': 'Phone Carrier Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [25]:
data_box_carrier = []
traces_carriers = generate_traces(iphone_df,'Carrier')

for trace in traces_carriers:
    data_box_carrier.append(go.Box( y=trace[0],name='{}'.format(trace[1])))
    
iplot({'data':data_box_carrier, 'layout':{'title': 'Phone Carrier Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

# Cleaning Data

In [26]:
print(iphone_df.shape)

(347, 8)


Removing Outliers

In [27]:
#Size Outliers
iphone_df = iphone_df[(iphone_df['Recently Sold Price']<414) | (iphone_df['Size']!=32)]
iphone_df = iphone_df[(iphone_df['Recently Sold Price']<449) | (iphone_df['Size']!=128)]

#Condition Outliers
iphone_df = iphone_df[(iphone_df['Recently Sold Price']<559) | (iphone_df['Condition']!='Fair')]

#Model Outliers
iphone_df = iphone_df[(iphone_df['Recently Sold Price']!=399) | (iphone_df['Phone Model']!='apple-iphone-6s-plus')]
iphone_df = iphone_df[(iphone_df['Recently Sold Price']!=879) | (iphone_df['Phone Model']!='apple-iphone-x-a1865')]


Removing Useless Columns

In [28]:
iphone_df.drop(columns=['Color','Carrier','Base Price When New'],inplace=True)

In [29]:
iphone_df.reset_index(inplace=True);

In [30]:
print(iphone_df.columns)

Index(['index', 'Phone Model', 'Condition', 'Size', 'Recently Sold Price',
       'Date'],
      dtype='object')


## One Hot Encoding All Categorical Variables

In [31]:
def bin_multi_label(dataframe,column_name):
    data_ = dataframe[column_name].value_counts()

    list_ = [[i] for i in list(dataframe[column_name])]

    one_hot = MultiLabelBinarizer()
    one_hot.fit([list(data_.index)]) 
    one_hot_data_= one_hot.transform(list_)

    print("Model Classes:", one_hot.classes_)
    model_df = pd.DataFrame(one_hot_data_,columns = one_hot.classes_)
    print("Shape is:",model_df.shape)
    print("\n")
    return one_hot, model_df

In [32]:
mlb_model,model_df = bin_multi_label(iphone_df,'Phone Model')
mlb_condition,condition_df = bin_multi_label(iphone_df,'Condition')

Model Classes: ['apple-iphone-6s' 'apple-iphone-6s-plus' 'apple-iphone-7-a1660'
 'apple-iphone-7-plus-a1661' 'apple-iphone-8-a1863'
 'apple-iphone-8-plus-a1864' 'apple-iphone-se' 'apple-iphone-x-a1865']
Shape is: (338, 8)


Model Classes: ['Fair' 'Good' 'Mint' 'New (Resale)']
Shape is: (338, 4)




In [33]:
model_df.drop(columns=['apple-iphone-7-plus-a1661'],inplace=True)
condition_df.drop(columns=['Mint'],inplace=True)

In [34]:
data_encoded = pd.concat([model_df,condition_df,iphone_df['Size'],iphone_df['Recently Sold Price']], axis=1,sort=False)
print(data_encoded.columns)
print(data_encoded.shape)

Index(['apple-iphone-6s', 'apple-iphone-6s-plus', 'apple-iphone-7-a1660',
       'apple-iphone-8-a1863', 'apple-iphone-8-plus-a1864', 'apple-iphone-se',
       'apple-iphone-x-a1865', 'Fair', 'Good', 'New (Resale)', 'Size',
       'Recently Sold Price'],
      dtype='object')
(338, 12)


In [35]:
file_name = "data_encoded_all.csv"
data_encoded.to_csv(file_name, encoding='utf-8', index=False)

In [36]:
def undo_mlb_transform(mlb, data_array):
    data_array = [[i] for i in data_array]
    undid = mlb.inverse_transform(np.transpose(data_array))
    return undid[0][0]

In [37]:
pickle.dump(mlb_model, open('mlb_model.sav', 'wb'))
pickle.dump(mlb_condition, open('mlb_condition.sav', 'wb'))