In [1]:
#Convert Grey -> Gray
import pandas as pd
import numpy as np
import pickle

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

from sklearn.preprocessing import MultiLabelBinarizer


init_notebook_mode(connected=True)

In [2]:
file_name = "iphone_df.csv"
iphone_df = pd.read_csv(file_name)

In [3]:
def generate_traces(dataframe,col):
    unique_elem = dataframe[col].unique()
    traces = []
    for uni in unique_elem:
        traces.append(list(iphone_df[iphone_df[col] == uni]['Recently Sold Price']))
    return traces, unique_elem

In [4]:
iphone_df.columns

Index(['Phone Model', 'Carrier', 'Condition', 'Color', 'Size',
       'Recently Sold Price'],
      dtype='object')

In [5]:
models = iphone_df['Phone Model'].value_counts()
list(models)

[20, 20, 20, 20, 19, 15, 5, 5]

In [6]:
data_models = [go.Bar(
            x=list(models.index),
            y=list(models)
    )]

iplot({'data':data_models, 'layout':{'title': 'Phone Model Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [7]:
data_box_model = []
traces_models, unique_models = generate_traces(iphone_df,'Phone Model')
i = 0
for trace in traces_models:
    data_box_model.append(go.Box( y= trace,name=unique_models[i]))
    i+=1
iplot({'data':data_box_model, 'layout':{'title': 'Memory Size Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

In [8]:
colors = iphone_df['Color'].value_counts()
list(colors)

[33, 27, 21, 17, 16, 8, 2]

In [9]:
data_color = [go.Bar(
            x=list(colors.index),
            y=list(colors)
    )]

iplot({'data':data_color, 'layout':{'title': 'Color Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [10]:
colors = iphone_df['Color'].unique()

Are greys different? Do I clump wuth silver

In [11]:
data_box_color = []
traces_colors, unique_colors = generate_traces(iphone_df,'Color')
i = 0
for trace in traces_colors:
    data_box_color.append(go.Box( y= trace,name=unique_colors[i]))
    i+=1
iplot({'data':data_box_color, 'layout':{'title': 'Color Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

In [12]:
size = iphone_df['Size'].value_counts()
size_str = [str(i)+'GB' for i in size.index]
print(size_str)

['64GB', '16GB', '256GB', '128GB', '32GB']


In [13]:
data_size = [go.Bar(
            x=size_str,
            y=list(size)
    )]

iplot({'data':data_size, 'layout':{'title': 'Memory Size Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [14]:
data_box_size = []
traces_sizes, unique_sizes = generate_traces(iphone_df,'Size')
i = 0
for trace in traces_sizes:
    data_box_size.append(go.Box( y= trace,name=size_str[i]))
    i+=1
iplot({'data':data_box_size, 'layout':{'title': 'Memory Size Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

In [15]:
condition = iphone_df['Condition'].value_counts()
print(condition)

Good            63
Mint            50
Fair             6
New (Resale)     5
Name: Condition, dtype: int64


In [16]:
data_condition = [go.Bar(
            x=list(condition.index),
            y=list(condition)
    )]

iplot({'data':data_condition, 'layout':{'title': 'Phone Condition Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [17]:
data_box_condition = []
traces_condition, unique_conditions = generate_traces(iphone_df,'Condition')
i = 0
for trace in traces_condition:
    data_box_condition.append(go.Box( y= trace,name=unique_conditions[i]))
    i+=1
iplot({'data':data_box_condition, 'layout':{'title': 'Phone Condition Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

In [18]:
carrier = iphone_df['Carrier'].value_counts()
print(condition)

Good            63
Mint            50
Fair             6
New (Resale)     5
Name: Condition, dtype: int64


In [19]:
data_carrier = [go.Bar(
            x=list(carrier.index),
            y=list(carrier)
    )]

iplot({'data':data_carrier, 'layout':{'title': 'Phone Carrier Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [20]:
data_box_carrier = []
traces_carriers, unique_carriers = generate_traces(iphone_df,'Carrier')
i = 0
for trace in traces_carriers:
    data_box_carrier.append(go.Box( y= trace,name=unique_carriers[i]))
    i+=1
iplot({'data':data_box_carrier, 'layout':{'title': 'Phone Carrier Box Plots',
                          'font': dict(size=16)}}, filename='basic-bar')

## One Hot Encoding All Categorical Variables

In [21]:
def bin_multi_label(dataframe,column_name):
    data_ = dataframe[column_name].value_counts()

    list_ = [[i] for i in list(dataframe[column_name])]

    one_hot = MultiLabelBinarizer()
    one_hot.fit([list(data_.index)]) 
    one_hot_data_= one_hot.transform(list_)

    print("Model Classes:", one_hot.classes_)
    model_df = pd.DataFrame(one_hot_data_,columns = one_hot.classes_)
    #print(model_df.head(10))
    print("Shape is:",model_df.shape)
    print("\n")
    return one_hot, model_df

In [22]:
mlb_model,model_df = bin_multi_label(iphone_df,'Phone Model')
mlb_color,color_df = bin_multi_label(iphone_df,'Color')
mlb_carrier,carrier_df = bin_multi_label(iphone_df,'Carrier')
mlb_condition,condition_df = bin_multi_label(iphone_df,'Condition')

Model Classes: ['apple-iphone-6s' 'apple-iphone-6s-plus' 'apple-iphone-7-a1660'
 'apple-iphone-7-plus-a1661' 'apple-iphone-8-a1863'
 'apple-iphone-8-plus-a1864' 'apple-iphone-se' 'apple-iphone-x-a1865']
Shape is: (124, 8)


Model Classes: ['Black' 'Gold' 'Gray' 'Grey' 'Red' 'Rose Gold' 'Silver']
Shape is: (124, 7)


Model Classes: ['att' 'sprint' 'tmobile' 'verizon']
Shape is: (124, 4)


Model Classes: ['Fair' 'Good' 'Mint' 'New (Resale)']
Shape is: (124, 4)




In [23]:
#model_df.reset_index(drop=True, inplace=True)
#carrier_df.reset_index(drop=True, inplace=True)
data_encoded = pd.concat([model_df,carrier_df,condition_df,color_df,iphone_df['Size'],iphone_df['Recently Sold Price']], axis=1,sort=False)
print(data_encoded.columns)
print(data_encoded.shape)

Index(['apple-iphone-6s', 'apple-iphone-6s-plus', 'apple-iphone-7-a1660',
       'apple-iphone-7-plus-a1661', 'apple-iphone-8-a1863',
       'apple-iphone-8-plus-a1864', 'apple-iphone-se', 'apple-iphone-x-a1865',
       'att', 'sprint', 'tmobile', 'verizon', 'Fair', 'Good', 'Mint',
       'New (Resale)', 'Black', 'Gold', 'Gray', 'Grey', 'Red', 'Rose Gold',
       'Silver', 'Size', 'Recently Sold Price'],
      dtype='object')
(124, 25)


In [24]:
#file_name = "data_encoded.csv"
#data_encoded.to_csv(file_name, encoding='utf-8', index=False)

In [25]:
def undo_mlb_transform(mlb, data_array):
    data_array = [[i] for i in data_array]
    undid = mlb.inverse_transform(np.transpose(data_array))
    return undid[0][0]

In [26]:
pickle.dump(mlb_model, open('mlb_model.sav', 'wb'))
pickle.dump(mlb_color, open('mlb_color.sav', 'wb'))
pickle.dump(mlb_carrier, open('mlb_carrier.sav', 'wb'))
pickle.dump(mlb_condition, open('mlb_condition.sav', 'wb'))