In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import pickle
import datetime as date
import math

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [2]:
with open('data_selection','rb') as read_file:
    df = pickle.load(read_file)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2901 entries, 0 to 8608
Data columns (total 9 columns):
dimentions            2901 non-null object
weight_g              2901 non-null object
display_resolution    2901 non-null object
display_size          2901 non-null object
internal_memory       2901 non-null object
RAM                   2901 non-null object
primary_camera        2901 non-null object
battery               2901 non-null object
approx_price_EUR      2901 non-null float64
dtypes: float64(1), object(8)
memory usage: 226.6+ KB


In [9]:
df.head(3)

Unnamed: 0,dimensions,weight_g,display_resolution,display_size,internal_memory,RAM,primary_camera,battery,approx_price_EUR
0,191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in),260.0,7.0 inches (~69.8% screen-to-body ratio),720 x 1280 pixels (~210 ppi pixel density),16/32 GB,2 GB RAM,13 MP| autofocus,Non-removable Li-Ion 3400 mAh battery (12.92 Wh),170.0
1,153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in),169.0,5.5 inches (~71.7% screen-to-body ratio),1080 x 1920 pixels (~401 ppi pixel density),32 GB,3 GB RAM,13 MP| autofocus| LED flash,Removable Li-Po 4080 mAh battery,250.0
2,153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in),166.0,5.5 inches (~69.0% screen-to-body ratio),720 x 1280 pixels (~267 ppi pixel density),32 GB,3 GB RAM,13 MP| f/1.8| autofocus| LED flash|,Removable Li-Po 4020 mAh battery,230.0


In [4]:
df.head(3)

Unnamed: 0,dimentions,weight_g,display_resolution,display_size,internal_memory,RAM,primary_camera,battery,approx_price_EUR
0,191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in),260,7.0 inches (~69.8% screen-to-body ratio),720 x 1280 pixels (~210 ppi pixel density),16/32 GB,2 GB RAM,13 MP| autofocus,Non-removable Li-Ion 3400 mAh battery (12.92 Wh),170.0
1,153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in),169,5.5 inches (~71.7% screen-to-body ratio),1080 x 1920 pixels (~401 ppi pixel density),32 GB,3 GB RAM,13 MP| autofocus| LED flash,Removable Li-Po 4080 mAh battery,250.0
4,153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in),166,5.5 inches (~69.0% screen-to-body ratio),720 x 1280 pixels (~267 ppi pixel density),32 GB,3 GB RAM,13 MP| f/1.8| autofocus| LED flash|,Removable Li-Po 4020 mAh battery,230.0


In [5]:
df = df.rename(columns={'dimentions': 'dimensions'})

## Clean Weight

In [6]:
df.drop(df[df.weight_g == '225-236'].index, inplace=True)

In [7]:
#convert weight_g column to float

df['weight_g'] = df['weight_g'].astype(float)

In [8]:
#resets indices

df = df.reset_index()
df.drop(['index'], axis=1, inplace=True)

## Clean Screen Size

In [10]:
df_splitres = pd.DataFrame(df.display_resolution.str.split().tolist(), columns="display_res inches percent extra1 extra2 a b c d".split())

In [11]:
df_splitres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 9 columns):
display_res    2900 non-null object
inches         2900 non-null object
percent        2887 non-null object
extra1         2887 non-null object
extra2         2887 non-null object
a              40 non-null object
b              38 non-null object
c              38 non-null object
d              38 non-null object
dtypes: object(9)
memory usage: 204.0+ KB


In [12]:
#adding the resolution inch size to a column and dropping unneeded display_resolution column
df['new_res'] = df_splitres.display_res

In [13]:
df = df.drop(['display_resolution'], axis = 1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 9 columns):
dimensions          2900 non-null object
weight_g            2900 non-null float64
display_size        2900 non-null object
internal_memory     2900 non-null object
RAM                 2900 non-null object
primary_camera      2900 non-null object
battery             2900 non-null object
approx_price_EUR    2900 non-null float64
new_res             2900 non-null object
dtypes: float64(2), object(7)
memory usage: 204.0+ KB


In [15]:
df_cleanres = df.copy()
df_cleanres.drop(df_cleanres[df_cleanres.new_res == '|'].index, inplace=True)
df_cleanres['new_res'] = df_cleanres['new_res'].astype(float)

### Split Dimensions into 3 Columns

In [16]:
df_cleandim = df_cleanres.copy()

In [17]:
df_cleandim = df_cleandim.reset_index()
df_cleandim.drop(['index'], axis=1, inplace=True)

In [18]:
df_splitdim = pd.DataFrame(df_cleandim.dimensions.str.split('x').tolist(), columns="length width thickness extra1 extra2".split())

In [19]:
df_cleandim['length'] = df_splitdim.length
df_cleandim['width'] = df_splitdim.width
df_cleandim['thickness'] = df_splitdim.thickness

In [20]:
df_cleandim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2899 entries, 0 to 2898
Data columns (total 12 columns):
dimensions          2899 non-null object
weight_g            2899 non-null float64
display_size        2899 non-null object
internal_memory     2899 non-null object
RAM                 2899 non-null object
primary_camera      2899 non-null object
battery             2899 non-null object
approx_price_EUR    2899 non-null float64
new_res             2899 non-null float64
length              2899 non-null object
width               2881 non-null object
thickness           2880 non-null object
dtypes: float64(3), object(9)
memory usage: 271.9+ KB


In [21]:
df_cleandim = df_cleandim.dropna()

In [22]:
df_cleandim = df_cleandim.drop(['dimensions'], axis=1)

In [23]:
df_cleandim.drop(df_cleandim[df_cleandim.length == '- '].index, inplace=True)

In [24]:
df_cleandim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2879 entries, 0 to 2898
Data columns (total 11 columns):
weight_g            2879 non-null float64
display_size        2879 non-null object
internal_memory     2879 non-null object
RAM                 2879 non-null object
primary_camera      2879 non-null object
battery             2879 non-null object
approx_price_EUR    2879 non-null float64
new_res             2879 non-null float64
length              2879 non-null object
width               2879 non-null object
thickness           2879 non-null object
dtypes: float64(3), object(8)
memory usage: 269.9+ KB


In [25]:
df_cleandim['length'] = df_cleandim['length'].astype(float)

In [26]:
df_cleandim['width'] = df_cleandim['width'].astype(float)

In [27]:
#starting to clean thickness column

df_split_thick = pd.DataFrame(df_cleandim.thickness.str.split().tolist(), 
                              columns="thick extra1 extra2 extra3 a b c".split())

In [28]:
df_split_thick.tail()

Unnamed: 0,thick,extra1,extra2,extra3,a,b,c
2874,13.0,mm,(4.57,,,,
2875,12.6,mm,(7.56,,,,
2876,13.5,mm,(4.09,,,,
2877,11.8,mm,(4.57,,,,
2878,14.5,mm,(4.02,,,,


In [29]:
df_cleandim = df_cleandim.reset_index()
df_cleandim.drop(['index'], axis=1, inplace=True)

In [30]:
df_cleandim['new_thickness'] = df_split_thick.thick

In [31]:
df_cleandim.drop(['thickness'], axis=1, inplace=True)

In [32]:
df_cleandim.head(3)

Unnamed: 0,weight_g,display_size,internal_memory,RAM,primary_camera,battery,approx_price_EUR,new_res,length,width,new_thickness
0,260.0,720 x 1280 pixels (~210 ppi pixel density),16/32 GB,2 GB RAM,13 MP| autofocus,Non-removable Li-Ion 3400 mAh battery (12.92 Wh),170.0,7.0,191.7,101.0,9.4
1,169.0,1080 x 1920 pixels (~401 ppi pixel density),32 GB,3 GB RAM,13 MP| autofocus| LED flash,Removable Li-Po 4080 mAh battery,250.0,5.5,153.8,75.6,8.5
2,166.0,720 x 1280 pixels (~267 ppi pixel density),32 GB,3 GB RAM,13 MP| f/1.8| autofocus| LED flash|,Removable Li-Po 4020 mAh battery,230.0,5.5,153.3,78.8,8.5


## Clean Thickness Column

In [34]:
df_cleanthick = df_cleandim.copy()

In [35]:
df_cleanthick.drop(df_cleanthick[df_cleanthick.new_thickness.str.contains("-")]
                    .index, inplace=True)

df_cleanthick['new_thickness'] = df_cleanthick['new_thickness'].astype(float)

## Clean Battery Column

In [36]:
df_cleanbatt = df_cleanthick.copy()

In [37]:
df_cleanbatt = df_cleanbatt.reset_index()
df_cleanbatt.drop(['index'], axis=1, inplace=True)

In [38]:
df_cleanbatt['new_battery'] = df_cleanbatt.battery.str.extract('(\d+)')

In [39]:
df_cleanbatt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2873 entries, 0 to 2872
Data columns (total 12 columns):
weight_g            2873 non-null float64
display_size        2873 non-null object
internal_memory     2873 non-null object
RAM                 2873 non-null object
primary_camera      2873 non-null object
battery             2873 non-null object
approx_price_EUR    2873 non-null float64
new_res             2873 non-null float64
length              2873 non-null float64
width               2873 non-null float64
new_thickness       2873 non-null float64
new_battery         2820 non-null object
dtypes: float64(6), object(6)
memory usage: 269.4+ KB


In [40]:
df_cleanbatt = df_cleanbatt.dropna()

In [41]:
df_cleanbatt['new_battery'] = df_cleanbatt['new_battery'].astype(float)

In [42]:
df_cleanbatt.drop(['battery'], axis=1, inplace=True)

In [43]:
df_cleanbatt = df_cleanbatt.reset_index()
df_cleanbatt.drop(['index'], axis=1, inplace=True)

## Clean Camera Column

In [44]:
df_cleancam = df_cleanbatt.copy()

In [45]:
df_cleancam['primary_camera'] = df_cleancam.primary_camera.str.extract('(\d+)')

In [46]:
df_cleancam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2820 entries, 0 to 2819
Data columns (total 11 columns):
weight_g            2820 non-null float64
display_size        2820 non-null object
internal_memory     2820 non-null object
RAM                 2820 non-null object
primary_camera      2783 non-null object
approx_price_EUR    2820 non-null float64
new_res             2820 non-null float64
length              2820 non-null float64
width               2820 non-null float64
new_thickness       2820 non-null float64
new_battery         2820 non-null float64
dtypes: float64(7), object(4)
memory usage: 242.4+ KB


In [47]:
df_cleancam = df_cleancam.dropna()

In [48]:
df_cleancam['primary_camera'] = df_cleancam['primary_camera'].astype(float)

In [49]:
df_cleancam.drop(['display_size'], axis=1, inplace=True)

In [50]:
df_cleancam.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2783 entries, 0 to 2819
Data columns (total 10 columns):
weight_g            2783 non-null float64
internal_memory     2783 non-null object
RAM                 2783 non-null object
primary_camera      2783 non-null float64
approx_price_EUR    2783 non-null float64
new_res             2783 non-null float64
length              2783 non-null float64
width               2783 non-null float64
new_thickness       2783 non-null float64
new_battery         2783 non-null float64
dtypes: float64(8), object(2)
memory usage: 239.2+ KB


## Clean Memory Column

In [61]:
df_cleanmem = df_cleancam.copy()

In [62]:
#function that takes GB or MB, and converts it all into MB


def convert(cell):
    split = cell.split()
    if len(split) > 1:
        try:
            if split[1] == 'GB':
                return float(split[0])*1000
            if split[1] == 'MB':
                return float(split[0])
            else:
                return math.nan
        except ValueError:
            return math.nan

    else:
        return math.nan

In [63]:
df_cleanmem['new_memory'] = df_cleanmem['internal_memory'].apply(convert)

In [64]:
df_cleanmem = df_cleanmem.dropna()

In [65]:
df_cleanmem.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2382 entries, 1 to 2819
Data columns (total 11 columns):
weight_g            2382 non-null float64
internal_memory     2382 non-null object
RAM                 2382 non-null object
primary_camera      2382 non-null float64
approx_price_EUR    2382 non-null float64
new_res             2382 non-null float64
length              2382 non-null float64
width               2382 non-null float64
new_thickness       2382 non-null float64
new_battery         2382 non-null float64
new_memory          2382 non-null float64
dtypes: float64(9), object(2)
memory usage: 223.3+ KB


In [66]:
df_cleanmem.head(3)

Unnamed: 0,weight_g,internal_memory,RAM,primary_camera,approx_price_EUR,new_res,length,width,new_thickness,new_battery,new_memory
1,169.0,32 GB,3 GB RAM,13.0,250.0,5.5,153.8,75.6,8.5,4080.0,32000.0
2,166.0,32 GB,3 GB RAM,13.0,230.0,5.5,153.3,78.8,8.5,4020.0,32000.0
3,125.0,8 GB,1 GB RAM,8.0,110.0,5.0,145.7,71.2,8.4,2000.0,8000.0


## Clean RAM

In [67]:
df_cleanram = df_cleanmem.copy()

In [68]:
df_cleanram['new_ram'] = df_cleanram['RAM'].apply(convert)

In [69]:
df_cleanram = df_cleanram.dropna()

In [70]:
df_cleanram.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2352 entries, 1 to 2819
Data columns (total 12 columns):
weight_g            2352 non-null float64
internal_memory     2352 non-null object
RAM                 2352 non-null object
primary_camera      2352 non-null float64
approx_price_EUR    2352 non-null float64
new_res             2352 non-null float64
length              2352 non-null float64
width               2352 non-null float64
new_thickness       2352 non-null float64
new_battery         2352 non-null float64
new_memory          2352 non-null float64
new_ram             2352 non-null float64
dtypes: float64(10), object(2)
memory usage: 238.9+ KB


In [71]:
df_cleanram.head()

Unnamed: 0,weight_g,internal_memory,RAM,primary_camera,approx_price_EUR,new_res,length,width,new_thickness,new_battery,new_memory,new_ram
1,169.0,32 GB,3 GB RAM,13.0,250.0,5.5,153.8,75.6,8.5,4080.0,32000.0,3000.0
2,166.0,32 GB,3 GB RAM,13.0,230.0,5.5,153.3,78.8,8.5,4020.0,32000.0,3000.0
3,125.0,8 GB,1 GB RAM,8.0,110.0,5.0,145.7,71.2,8.4,2000.0,8000.0,1000.0
5,150.0,32 GB,3 GB RAM,21.0,220.0,5.5,156.5,75.9,8.4,2870.0,32000.0,3000.0
6,142.0,8 GB,1 GB RAM,5.0,110.0,4.5,136.0,66.5,9.6,2000.0,8000.0,1000.0


In [72]:
df_cleanram.drop(['internal_memory', 'RAM'], axis = 1, inplace = True)

In [73]:
df_cleanram.head(3)

Unnamed: 0,weight_g,primary_camera,approx_price_EUR,new_res,length,width,new_thickness,new_battery,new_memory,new_ram
1,169.0,13.0,250.0,5.5,153.8,75.6,8.5,4080.0,32000.0,3000.0
2,166.0,13.0,230.0,5.5,153.3,78.8,8.5,4020.0,32000.0,3000.0
3,125.0,8.0,110.0,5.0,145.7,71.2,8.4,2000.0,8000.0,1000.0


In [74]:
df_cleanram.columns = ['weight', 'camera', 'price', 'screen_size', 'length', 'width', 
                       'thickness', 'battery', 'memory', 'RAM']

In [75]:
df_cleanram.head()

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
1,169.0,13.0,250.0,5.5,153.8,75.6,8.5,4080.0,32000.0,3000.0
2,166.0,13.0,230.0,5.5,153.3,78.8,8.5,4020.0,32000.0,3000.0
3,125.0,8.0,110.0,5.0,145.7,71.2,8.4,2000.0,8000.0,1000.0
5,150.0,21.0,220.0,5.5,156.5,75.9,8.4,2870.0,32000.0,3000.0
6,142.0,5.0,110.0,4.5,136.0,66.5,9.6,2000.0,8000.0,1000.0


In [77]:
df_cleanram.describe()

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
count,2352.0,2352.0,2352.0,2352.0,2352.0,2352.0,2352.0,2352.0,2352.0,2352.0
mean,161.312075,7.679847,218.439201,4.593299,137.450085,72.28801,10.587415,2222.747874,11959.159439,1196.326531
std,90.14823,13.081915,409.689745,1.476031,28.881939,22.436305,3.022322,1132.918618,16241.072256,1069.919344
min,47.0,0.0,10.0,1.56,44.0,36.8,4.7,1.0,8.0,8.0
25%,124.925,5.0,120.0,3.7,118.5,62.0,8.5,1500.0,4000.0,512.0
50%,142.0,5.0,170.0,4.5,133.5,67.85,9.9,2000.0,8000.0,1000.0
75%,163.0,10.0,250.0,5.2,146.8,73.9,12.0,2700.0,16000.0,2000.0
max,2005.0,600.0,11500.0,17.3,418.1,283.2,28.2,9600.0,256000.0,16000.0


In [78]:
df_cleanram.sort_values(by = 'weight', ascending = False)

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
642,2005.0,0.0,370.0,17.30,418.1,283.2,15.8,43.0,32000.0,2000.0
1966,952.5,5.0,900.0,10.10,266.7,213.4,17.8,4590.0,16000.0,1000.0
643,850.0,8.0,490.0,12.30,300.3,222.0,8.0,21.0,32000.0,2000.0
798,709.0,8.0,280.0,10.10,251.0,178.0,13.0,7300.0,32000.0,1000.0
2008,680.0,2.0,230.0,10.10,175.0,256.8,10.2,6400.0,16000.0,1000.0
2567,671.3,8.0,440.0,10.10,261.6,177.8,10.2,33.0,32000.0,2000.0
1179,666.8,13.0,480.0,10.10,246.9,179.1,4.7,10.0,64000.0,4000.0
2009,660.0,2.0,190.0,10.10,175.0,256.8,10.2,6400.0,16000.0,1000.0
1638,650.0,5.0,230.0,10.10,260.0,177.0,11.4,24.0,16000.0,1000.0
205,645.4,2.0,160.0,9.70,242.5,178.0,8.9,8000.0,8000.0,1000.0


In [80]:
df_cleanram.drop(df_cleanram[df_cleanram.weight == 2005].index, inplace=True)

In [81]:
df_cleanram.sort_values(by = 'camera', ascending = False)

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
1799,78.8,600.0,50.0,1.8,108.0,45.0,14.0,800.0,10.0,16.0
1756,169.0,41.0,280.0,4.0,123.9,60.2,13.9,1400.0,16000.0,1000.0
1733,158.0,41.0,400.0,4.5,130.4,71.4,10.4,2000.0,32000.0,2000.0
626,210.0,24.0,480.0,6.0,164.0,82.3,9.6,3500.0,64000.0,3000.0
425,207.0,24.0,310.0,6.0,164.0,82.2,9.3,3500.0,64000.0,3000.0
2016,210.0,24.0,400.0,6.0,164.0,82.3,9.6,3500.0,64000.0,3000.0
188,204.4,24.0,490.0,6.0,164.0,82.0,9.6,3500.0,64000.0,3000.0
2433,138.0,23.0,410.0,4.6,127.0,65.0,8.9,2700.0,32000.0,2000.0
2421,135.0,23.0,370.0,4.6,129.0,65.0,9.5,2700.0,32000.0,3000.0
2422,161.0,23.0,610.0,5.2,146.0,72.0,8.1,2900.0,32000.0,3000.0


In [82]:
df_cleanram.drop(df_cleanram[df_cleanram.camera == 600].index, inplace=True)

In [83]:
df_cleanram.sort_values(by = 'price', ascending = False)

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
1929,180.0,13.0,11500.0,5.50,154.6,77.0,9.4,4000.0,16000.0,3000.0
2720,152.0,13.0,9000.0,5.00,142.0,69.6,8.7,3000.0,32000.0,4000.0
2580,192.0,13.0,8290.0,4.70,145.0,69.0,10.7,2275.0,64000.0,2000.0
1105,140.0,5.0,6099.0,4.50,135.0,67.2,9.5,2000.0,8000.0,1000.0
2579,193.0,13.0,5440.0,4.70,143.4,69.0,11.2,2275.0,64000.0,2000.0
1104,141.0,5.0,5400.0,4.00,123.0,63.7,10.1,1500.0,8000.0,1000.0
907,169.0,20.0,1400.0,5.50,152.0,75.0,7.5,4000.0,256000.0,6000.0
611,238.0,12.0,1150.0,5.70,155.2,77.6,7.4,7000.0,128000.0,6000.0
2038,173.0,12.0,900.0,6.20,159.5,73.4,8.1,3500.0,64000.0,4000.0
1966,952.5,5.0,900.0,10.10,266.7,213.4,17.8,4590.0,16000.0,1000.0


In [84]:
df_cleanram.drop(df_cleanram[df_cleanram.price > 5000].index, inplace=True)

In [85]:
df_cleanram.describe()

Unnamed: 0,weight,camera,price,screen_size,length,width,thickness,battery,memory,RAM
count,2344.0,2344.0,2344.0,2344.0,2344.0,2344.0,2344.0,2344.0,2344.0,2344.0
mean,160.547867,7.423635,199.496587,4.588712,137.335111,72.217449,10.58541,2223.553754,11904.408276,1194.003413
std,81.846145,4.672745,123.055089,1.453705,28.332823,22.039365,3.024261,1132.75865,16184.379931,1068.846415
min,47.0,1.0,10.0,1.56,44.0,36.8,4.7,1.0,8.0,8.0
25%,124.675,5.0,120.0,3.7,118.5,62.0,8.5,1500.0,4000.0,512.0
50%,142.0,5.0,170.0,4.5,133.3,67.8,9.9,2000.0,8000.0,1000.0
75%,163.0,8.0,250.0,5.2,146.8,73.9,12.0,2700.0,16000.0,2000.0
max,952.5,41.0,1400.0,12.3,300.3,256.8,28.2,9600.0,256000.0,16000.0


## Pickled clean data

In [86]:
with open('modeling_data', 'wb') as to_write:
    pickle.dump(df_cleanram, to_write)