## Dimension Reduction Practice

Reference: 
- https://www.analyticsvidhya.com/blog/2018/08/dimensionality-reduction-techniques-python/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('./dataset/Train_UWu5bXk.csv')
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Missing values

- set threshold at 20%, if ratio of missing values > 0.2 then remove that variable

In [3]:
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [4]:
# percent
train.isnull().sum() / (len(train)) * 100

Item_Identifier               0.000000
Item_Weight                  17.165317
Item_Fat_Content              0.000000
Item_Visibility               0.000000
Item_Type                     0.000000
Item_MRP                      0.000000
Outlet_Identifier             0.000000
Outlet_Establishment_Year     0.000000
Outlet_Size                  28.276428
Outlet_Location_Type          0.000000
Outlet_Type                   0.000000
Item_Outlet_Sales             0.000000
dtype: float64

In [5]:
RATIO_THRESHOLD = 20
ratio = train.isnull().sum() / len(train) * 100

variables = []
for i in range(len(train.columns)):
    if ratio[i] <= RATIO_THRESHOLD:
        variables.append(train.columns[i])

variables

['Item_Identifier',
 'Item_Weight',
 'Item_Fat_Content',
 'Item_Visibility',
 'Item_Type',
 'Item_MRP',
 'Outlet_Identifier',
 'Outlet_Establishment_Year',
 'Outlet_Location_Type',
 'Outlet_Type',
 'Item_Outlet_Sales']

In [6]:
# OR

train.Item_Weight.fillna(train.Item_Weight.median, inplace=True)
train.Outlet_Size.fillna(train.Outlet_Size.mode()[0], inplace=True)

In [7]:
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

### Low variance filter

- drop variables with low variance since it does not affect the target variable

In [8]:
train.var()

Item_Visibility              2.662335e-03
Item_MRP                     3.878184e+03
Outlet_Establishment_Year    7.008637e+01
Item_Outlet_Sales            2.912141e+06
dtype: float64

Item_Visibility variance is less compared to others -> drop

In [9]:
numeric = train[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']]
var = numeric.var()
numeric = numeric.columns
variables = [ ]
for i in range(0,len(var)):
    if var[i]>=10:   #setting the threshold as 10%
        variables.append(numeric[i+1])
variables

['Item_MRP', 'Outlet_Establishment_Year']

### High Correlation filter
- high correlation between two variables = similar trends and likely to contain similar information

- in general, if corr is greater than 0.5-0.6, we should consider dropping one of variables

In [10]:
df = train.drop('Item_Outlet_Sales', 1)
df.corr()

Unnamed: 0,Item_Visibility,Item_MRP,Outlet_Establishment_Year
Item_Visibility,1.0,-0.001315,-0.074834
Item_MRP,-0.001315,1.0,0.00502
Outlet_Establishment_Year,-0.074834,0.00502,1.0


### Random forest 
- feature selection

In [11]:
df = df.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1)

### Backward Feature Elimination

- take n variables -> calculate performance -> eliminate -> repeat

In [17]:
# from sklearn.linear_model import LinearRegression
# from sklearn.feature_selection import RFE
# from sklearn import datasets

# lreg = LinearRegression()
# rfe = RFE(lreg, 10)
# rfe = rfe.fit_transform(df, train.Item_Outlet_Sales)

In [16]:
# from sklearn.feature_selection import f_regression
# ffs = f_regression(df, train.Item_Outlet_Sales)