In [18]:
#Convert Grey -> Gray
import pandas as pd
import numpy as np
import pickle

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

from sklearn.preprocessing import MultiLabelBinarizer


init_notebook_mode(connected=True)

In [2]:
file_name = "iphone_df.csv"
iphone_df = pd.read_csv(file_name)

In [3]:
iphone_df.columns

Index(['Phone Model', 'Carrier', 'Condition', 'Color', 'Size',
       'Recently Sold Price'],
      dtype='object')

In [4]:
colors = iphone_df['Color'].value_counts()
list(colors)

[33, 27, 21, 17, 16, 8, 2]

In [23]:
data_color = [go.Bar(
            x=list(colors.index),
            y=list(colors)
    )]

iplot({'data':data_color, 'layout':{'title': 'Color Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [21]:
size = iphone_df['Size'].value_counts()
size_str = [str(i)+'GB' for i in size.index]
print(size_str)

['64GB', '16GB', '256GB', '128GB', '32GB']


In [25]:
data_size = [go.Bar(
            x=size_str,
            y=list(size)
    )]

iplot({'data':data_size, 'layout':{'title': 'Memory Size Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [8]:
condition = iphone_df['Condition'].value_counts()
print(condition)

Good            63
Mint            50
Fair             6
New (Resale)     5
Name: Condition, dtype: int64


In [28]:
data_condition = [go.Bar(
            x=list(condition.index),
            y=list(condition)
    )]

iplot({'data':data_condition, 'layout':{'title': 'Phone Condition Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

In [10]:
carrier = iphone_df['Carrier'].value_counts()
print(condition)

Good            63
Mint            50
Fair             6
New (Resale)     5
Name: Condition, dtype: int64


In [30]:
data_carrier = [go.Bar(
            x=list(carrier.index),
            y=list(carrier)
    )]

iplot({'data':data_carrier, 'layout':{'title': 'Phone Carrier Distribution',
                          'font': dict(size=16)}}, filename='basic-bar')

## One Hot Encoding All Categorical Variables

In [37]:
def bin_multi_label(dataframe,column_name):
    data_ = dataframe[column_name].value_counts()

    list_ = [[i] for i in list(dataframe[column_name])]

    one_hot = MultiLabelBinarizer()
    one_hot.fit([list(data_.index)]) 
    one_hot_data_= one_hot.transform(list_)

    print("Model Classes:", one_hot.classes_)
    model_df = pd.DataFrame(one_hot_data_,columns = one_hot.classes_)
    #print(model_df.head(10))
    print("Shape is:",model_df.shape)
    print("\n")
    return one_hot, model_df

In [38]:
mlb_model,model_df = bin_multi_label(iphone_df,'Phone Model')
mlb_color,color_df = bin_multi_label(iphone_df,'Color')
mlb_carrier,carrier_df = bin_multi_label(iphone_df,'Carrier')
mlb_condition,condition_df = bin_multi_label(iphone_df,'Condition')

Model Classes: ['apple-iphone-6s' 'apple-iphone-6s-plus' 'apple-iphone-7-a1660'
 'apple-iphone-7-plus-a1661' 'apple-iphone-8-a1863'
 'apple-iphone-8-plus-a1864' 'apple-iphone-se' 'apple-iphone-x-a1865']
Shape is: (124, 8)


Model Classes: ['Black' 'Gold' 'Gray' 'Grey' 'Red' 'Rose Gold' 'Silver']
Shape is: (124, 7)


Model Classes: ['att' 'sprint' 'tmobile' 'verizon']
Shape is: (124, 4)


Model Classes: ['Fair' 'Good' 'Mint' 'New (Resale)']
Shape is: (124, 4)




In [32]:
#model_df.reset_index(drop=True, inplace=True)
#carrier_df.reset_index(drop=True, inplace=True)
data_encoded = pd.concat([model_df,carrier_df,condition_df,color_df,iphone_df['Size'],iphone_df['Recently Sold Price']], axis=1,sort=False)
print(data_encoded.columns)
print(data_encoded.shape)

Index(['apple-iphone-6s', 'apple-iphone-6s-plus', 'apple-iphone-7-a1660',
       'apple-iphone-7-plus-a1661', 'apple-iphone-8-a1863',
       'apple-iphone-8-plus-a1864', 'apple-iphone-se', 'apple-iphone-x-a1865',
       'att', 'sprint', 'tmobile', 'verizon', 'Fair', 'Good', 'Mint',
       'New (Resale)', 'Black', 'Gold', 'Gray', 'Grey', 'Red', 'Rose Gold',
       'Silver', 'Size', 'Recently Sold Price'],
      dtype='object')
(124, 25)


In [15]:
#file_name = "data_encoded.csv"
#data_encoded.to_csv(file_name, encoding='utf-8', index=False)

In [16]:
def undo_mlb_transform(mlb, data_array):
    data_array = [[i] for i in data_array]
    undid = mlb.inverse_transform(np.transpose(data_array))
    return undid[0][0]

In [20]:
pickle.dump(mlb_model, open('mlb_model.sav', 'wb'))
pickle.dump(mlb_color, open('mlb_color.sav', 'wb'))
pickle.dump(mlb_carrier, open('mlb_carrier.sav', 'wb'))
pickle.dump(mlb_condition, open('mlb_condition.sav', 'wb'))