# Filter Method

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression


In [None]:
# Read the cars dataset from the working directory and display the frame.
df = pd.read_csv("cars.csv")
df

In [None]:
# Removing junk data from the dataset: "normalized-losses" marks missing
# entries with "?", which keeps the column from being numeric.
# Each result is assigned back to the column instead of calling
# replace/fillna with inplace=True on df[...]: with pandas Copy-on-Write an
# in-place call on a selected column updates a temporary object and leaves
# df itself unchanged.
df["normalized-losses"] = (
    df["normalized-losses"].replace("?", np.nan).astype(float)
)
# Impute the missing entries with the mean of the observed values.
losses_mean = df["normalized-losses"].mean()
df["normalized-losses"] = df["normalized-losses"].fillna(losses_mean)

In [None]:
# "horsepower" uses the same "?" placeholder for missing values: convert it
# to NaN, cast to float, then fill the gaps with the column mean.
# Results are assigned back rather than using inplace=True on df[...], which
# under pandas Copy-on-Write would modify a temporary and leave df unchanged.
df["horsepower"] = df["horsepower"].replace("?", np.nan).astype(float)
horsepower_mean = df["horsepower"].mean()
df["horsepower"] = df["horsepower"].fillna(horsepower_mean)

In [None]:
# Adding labels to categorical data.
# Numeric columns are kept as they are; object (text) columns are replaced by
# integer codes, one LabelEncoder fitted per column.
df_num = df.select_dtypes(["int64","float64"])
# Take an explicit copy: the loop below writes into df_cat, and writing into
# the object returned by select_dtypes triggers SettingWithCopyWarning.
df_cat = df.select_dtypes("object").copy()
for col in df_cat.columns:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

In [None]:
# Put the numeric columns and the label-encoded categorical columns side by
# side in one frame.
df_new = pd.concat([df_num, df_cat], axis="columns")

In [None]:
# Preview the first rows of the combined frame.
df_new.head()

In [None]:
# Split for the baseline model: "price" is the target, every other column is
# a predictor; 30% of the rows are held out for testing (fixed seed).
y = df_new["price"]
X = df_new.drop(columns="price")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Baseline model: linear regression fitted on all predictors of the training
# split. The bare name on the last line keeps the estimator as cell output.
lin = LinearRegression().fit(X_train, y_train)
lin

In [None]:
# Score the baseline model on the held-out test split.
lin.score(X_test,y_test)

In [None]:
# Pairwise correlations between the numeric columns.
# df still contains text columns at this point; DataFrame.corr() raises on
# them in pandas >= 2.0 (older versions dropped them silently), so restrict
# the frame to numeric dtypes explicitly.
df.select_dtypes(include="number").corr()

In [None]:
# Feature engineering: add a "size" column holding width * height.
df_new["size"] = df_new["width"].mul(df_new["height"])
df_new.head()

In [None]:
# Reduced feature set: remove width, height, symboling and normalized-losses
# from df_new, then split into predictors / target and train / test using the
# same 30% hold-out and seed as before.
cor_df = df_new.drop(
    columns=["width", "height", "symboling", "normalized-losses"]
)
y = cor_df["price"]
X = cor_df.drop(columns="price")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Refit the linear model on the current training split and print its score on
# the test split.
lin = LinearRegression().fit(X_train, y_train)
print(lin.score(X_test, y_test))

In [2]:
#score_func of Chi Squared Test
from sklearn.feature_selection import chi2
# score_func: univariate F-test for regression targets (ANOVA-style)
from sklearn.feature_selection import f_regression
# class that accepts the score_func parameters
from sklearn.feature_selection import SelectKBest

In [None]:
# Rebuild predictors / target from the full df_new frame and redo the 70/30
# split with the same seed.
y = df_new["price"]
X = df_new.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Selector that keeps the k=10 features ranked highest by f_regression.
annova = SelectKBest(score_func=f_regression,k=10)


In [None]:
# Fit the selector on the training split only, then apply the same column
# selection to the test split so both matrices share the chosen features.
X_train_annova = annova.fit_transform(X_train, y_train)
X_test_annova = annova.transform(X_test)

# Linear regression on the selected features; the bare name on the last line
# keeps the fitted estimator as the cell output.
lin_annova = LinearRegression().fit(X_train_annova, y_train)
lin_annova

In [None]:
# Score the model trained on the selected features, using the test split
# reduced to the same features.
lin_annova.score(X_test_annova, y_test)


In [None]:
# Chi-squared selection on the cars data, kept as a demonstration of its
# input requirement: chi2 only accepts non-negative feature values and raises
# ValueError otherwise. The error is caught and displayed so that a full
# "Restart & Run All" does not stop at this cell.
chi = SelectKBest(score_func=chi2, k=7)

try:
    X_train_chi = chi.fit_transform(X_train, y_train)
except ValueError as err:
    print(f"chi2 feature selection failed: {err}")
else:
    # Only reached when the selection succeeded: apply the same selection to
    # the test split and fit a linear model on the selected features.
    X_test_chi = chi.transform(X_test)
    lin_chi = LinearRegression()
    lin_chi.fit(X_train_chi, y_train)

In [None]:
# Load the mobile dataset and separate predictors from the target.
data = pd.read_csv("mobile.csv")
X = data.iloc[:, :20]   # independent columns: the first 20
y = data.iloc[:, -1]    # target column (last one), i.e. price range

In [None]:
# Score every feature against the target with chi2; the selector is set up to
# keep the 10 best. `fit` holds the value returned by the fit call.
bestfeatures = SelectKBest(chi2, k=10)
fit = bestfeatures.fit(X, y)

In [None]:
# Fit the selector on X, y again; as the last expression of the cell, the
# returned value is shown as the cell output.
bestfeatures.fit(X,y)

In [None]:
# One single-column frame with the feature names and one with the per-feature
# scores taken from the fitted selector.
dfcolumns = pd.DataFrame(data=X.columns)
dfscores = pd.DataFrame(data=fit.scores_)

In [None]:
# Join feature names and scores side by side for easier reading, then give
# the two columns descriptive names.
featureScores = pd.concat([dfcolumns, dfscores], axis="columns")
featureScores.columns = ['Specs', 'Score']

In [None]:
# Display the full feature / score table.
featureScores

In [None]:
# Print the 10 rows with the largest 'Score' values (plain-text output).
print(featureScores.nlargest(10,'Score'))  #print 10 best features

In [None]:
# Show the 10 highest-scoring features as a rich table.
# The result is deliberately left unassigned: binding it to `data` would
# replace the loaded mobile dataset with this unrelated 10-row table.
featureScores.nlargest(10, 'Score')

In [9]:
# Read the mobile dataset again and rebuild predictors and target from it.
data = pd.read_csv("mobile.csv")
X = data.iloc[:, :20]   # independent columns: the first 20
y = data.iloc[:, -1]    # target column (last one)

In [16]:
# Display the loaded mobile dataset.
data

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,0
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,1
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,1
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,1
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,1
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [14]:
# Display the predictor columns.
X

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,14,1222,1890,668,13,4,19,1,1,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,3,915,1965,2032,11,10,16,1,1,1
1997,1911,0,0.9,1,1,1,36,0.7,108,8,3,868,1632,3057,9,1,5,1,1,0
1998,1512,0,0.9,0,4,1,46,0.1,145,5,5,336,670,869,18,10,19,1,1,1


In [15]:
# Display the target column.
y

0       0
1       1
2       1
3       1
4       0
       ..
1995    0
1996    1
1997    1
1998    0
1999    1
Name: price_range, Length: 2000, dtype: int64

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [11]:
# Keep the 10 features ranked highest by chi2. chi2 only accepts non-negative
# feature values; the recorded run of this cell completed without an error.
chi = SelectKBest(chi2, k=10)

# Fit the selector on the training split, then reuse it on the test split.
X_train_chi = chi.fit_transform(X_train, y_train)
X_test_chi = chi.transform(X_test)

# Linear regression on the selected features; the bare name on the last line
# keeps the fitted estimator as the cell output.
lin_chi = LinearRegression().fit(X_train_chi, y_train)
lin_chi

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Score the linear model on the chi2-selected test features.
lin_chi.score(X_test_chi, y_test)

0.6889656393341109

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Logistic regression classifier with default settings.
log=LogisticRegression()

In [19]:
# Train the classifier on the chi2-selected training features.
log.fit(X_train_chi, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Score the classifier on the chi2-selected test features.
log.score(X_test_chi, y_test)

0.9616666666666667