In [1]:
#Data.csv

**Step 1: Importing the libraries**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

**Step 2: Importing dataset**

In [3]:
# Load the raw dataset (expects Data.csv in the notebook's working directory)
# and preview the first five rows.
csv_path = "Data.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


#### Shape of the dataset in terms of rows and columns

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(10, 4)

#### Null values in the dataset

In [5]:
# Per-column count of missing entries (isna is the pandas alias of isnull).
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

##### There is one null value in the Age column and one null value in the Salary column.

#### Data types of the columns

In [6]:
# Storage dtype pandas inferred for each column when parsing the CSV.
df.dtypes

Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object

**Step 3: Handling the missing data**

In [7]:
# Drop every row that contains at least one missing value (dropna defaults),
# modifying df in place. Per the null counts shown earlier, this discards the
# rows lacking an Age or a Salary.
df.dropna(inplace=True)

**Step 4: Encoding categorical data**

In [8]:
# Class balance of the target column after the rows with missing values were removed.
df["Purchased"].value_counts()

No     4
Yes    4
Name: Purchased, dtype: int64

##### Apply label encoding to the Purchased column.

In [9]:
# Label-encode the target column: 'No' -> 0, 'Yes' -> 1.
# The encoded Series is assigned back to df instead of calling
# df['Purchased'].replace(..., inplace=True): that chained in-place call operates on
# an intermediate Series, is deprecated in pandas 2.x, and leaves df unchanged once
# Copy-on-Write is enabled.
purchased_codes = {'No': 0, 'Yes': 1}
df['Purchased'] = (
    df['Purchased']
    # Labels missing from the mapping (e.g. values already encoded as 0/1 when this
    # cell is re-run) are passed through unchanged, so the cell stays safe to re-run.
    .map(lambda label: purchased_codes.get(label, label))
    # Make the numeric dtype explicit rather than relying on implicit downcasting.
    .astype('int64')
)

**Step 5: Creating a dummy variable**

In [10]:
# Number of remaining rows per country, i.e. the categories that get one-hot encoded next.
df["Country"].value_counts()

France     4
Spain      2
Germany    2
Name: Country, dtype: int64

In [11]:
# One-hot encode Country into Country_<name> indicator columns; get_dummies drops the
# original Country column from the result.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to booleans,
# which would display True/False here and mix bool with float columns in the
# feature matrix built later.
df = pd.get_dummies(df, columns=['Country'], dtype=int)
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
5,35.0,58000.0,1,1,0,0
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [14]:
from sklearn.model_selection import train_test_split

# Feature matrix: the two numeric columns followed by the three country indicators.
feature_cols = ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']
X = df[feature_cols].to_numpy()
# Target vector: the encoded Purchased flag.
y = df['Purchased'].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [15]:
from sklearn.preprocessing import StandardScaler  # standard scaling

# Learn each feature's mean and standard deviation from the training split only,
# and standardise the training data in the same call.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# Apply the training-set statistics to the test split; the scaler is not refit here.
X_test_scaled = scaler.transform(X_test)

In [16]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit of the 0/1 Purchased target on the standardised features.
regressor = LinearRegression()
regressor.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [17]:
# Predictions for the held-out rows (consumed by the actual-vs-predicted table below).
y_pred = regressor.predict(X_test_scaled)

# One fitted weight per input column, labelled with the feature name.
feature_names = ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']
coeff_df = pd.DataFrame({'Coefficient': regressor.coef_}, index=feature_names)
coeff_df

Unnamed: 0,Coefficient
Age,-0.981352
Salary,0.738267
Country_France,0.101188
Country_Germany,0.0
Country_Spain,-0.101188


In [18]:
regressor.intercept_ # intercept of the fitted linear model (the constant term c)

0.6666666666666667

In [19]:
# Side-by-side table of the true test labels and the model's raw predictions.
# Stored under its own name so the encoded dataset held in `df` is not overwritten;
# rebinding `df` here would break any earlier cell that is re-run afterwards.
pred_vs_actual = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
pred_vs_actual

Unnamed: 0,Actual,Predicted
0,0,0.245517
1,0,1.045594


**Step 7: Feature Selection**

In [21]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: refit a linear model and remove one feature per
# step until three remain.
estimator = LinearRegression()
selector = RFE(estimator, n_features_to_select=3, step=1).fit(X_train_scaled, y_train)

# Pair every feature name with its keep/drop flag.
rfe_feature_names = ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']
[(name, kept) for name, kept in zip(rfe_feature_names, selector.support_)]

[('Age', True),
 ('Salary', True),
 ('Country_France', False),
 ('Country_Germany', False),
 ('Country_Spain', True)]

In [22]:
!pip install scikit-learn==0.24.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 2.2 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.4 requires scikit-learn>=1.0.0, but you have scikit-learn 0.24.2 which is incompatible.[0m
Successfully installed scikit-learn-0.24.2


In [23]:
from sklearn.feature_selection import SequentialFeatureSelector

# Sequential feature selection with a linear model as the scoring estimator,
# stopping once two features have been chosen.
estimator = LinearRegression()
sfs = SequentialFeatureSelector(estimator, n_features_to_select=2)
sfs.fit(X_train_scaled, y_train)

# Report, for each feature name, whether the selector kept it.
sfs_feature_names = ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']
selected_mask = sfs.get_support()
print(list(zip(sfs_feature_names, selected_mask)))

[('Age', True), ('Salary', True), ('Country_France', False), ('Country_Germany', False), ('Country_Spain', False)]


  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------
  -------


In [24]:
# Reduce the feature matrix to the columns chosen by the fitted sequential selector.
# NOTE(review): this is applied to the full, unscaled X, not to X_train_scaled.
X_trans = sfs.transform(X)
X_trans # display the reduced feature matrix

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [3.5e+01, 5.8e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])