<a href="https://colab.research.google.com/github/karman07/thapar_summer_school/blob/master/Sample_Linear_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing necessary libraries**

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

**Loading data into code**

In [17]:
# Read the training set; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("train.csv")
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,id,Row#,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,68.0,12.5,0.25,0.25,0.25,0.5,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.421449,0.403129,31.394569,4952.01304
1,1,514.0,25.0,0.5,0.25,0.5,0.75,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.586603,0.49002,40.282376,7532.82953
2,2,547.0,12.5,0.25,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.483671,0.411591,34.781055,5715.0084
3,3,535.0,25.0,0.5,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.429001,0.398864,31.119881,4575.76991
4,4,223.0,12.5,0.25,0.25,0.5,0.75,77.4,46.8,64.7,55.8,27.0,45.8,1.0,0.1,0.546136,0.475965,39.096884,6801.32393


In [18]:
# Bookkeeping columns removed before modelling (presumably row identifiers —
# confirm against the dataset description).
ID_COLUMNS = ["id", "Row#"]

# errors="ignore" keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=ID_COLUMNS, errors="ignore")
df.head()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,12.5,0.25,0.25,0.25,0.5,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.421449,0.403129,31.394569,4952.01304
1,25.0,0.5,0.25,0.5,0.75,94.6,57.2,79.0,68.2,33.0,55.9,1.0,0.1,0.586603,0.49002,40.282376,7532.82953
2,12.5,0.25,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.483671,0.411591,34.781055,5715.0084
3,25.0,0.5,0.25,0.5,0.75,69.7,42.1,58.2,50.2,24.3,41.2,24.0,0.39,0.429001,0.398864,31.119881,4575.76991
4,12.5,0.25,0.25,0.5,0.75,77.4,46.8,64.7,55.8,27.0,45.8,1.0,0.1,0.546136,0.475965,39.096884,6801.32393


**Outlier removal using the IQR (interquartile range) method**

In [19]:
# Keep only rows whose yield lies within 1.5 * IQR of the quartiles
# (bounds inclusive); everything outside that band is treated as an outlier.
# NOTE: df is overwritten, so re-running this cell recomputes the bounds on
# already-filtered data. Use Restart & Run All for reproducible results.
q1 = df["yield"].quantile(0.25)
q3 = df["yield"].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
df = df[df["yield"].between(lower, upper)]

**Getting top 10 features**

In [20]:
# Rank every numeric column by the absolute value of its correlation with
# the target, strongest first. The target itself is part of this ranking.
correlations = df.corr(numeric_only=True)["yield"].abs().sort_values(ascending=False)

# Remove the target by label instead of slicing [1:11]: the positional slice
# silently assumes "yield" sorts first, and would leak the target into the
# feature list if another column tied it at |r| == 1 and sorted ahead of it.
top_features = correlations.drop("yield").head(10).index.tolist()

**Preparing data for training**

In [21]:
# Feature matrix: only the columns selected by the correlation ranking.
X = df[top_features]
# Target vector, taken from the same (outlier-filtered) frame so that
# X and y stay row-aligned.
y = df["yield"]

**Standardizing the features (zero mean, unit variance)**

In [22]:
# Learn per-feature mean and standard deviation from the training features,
# then apply them. The fitted scaler is reused later on the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

**PCA**

In [23]:
# Per scikit-learn's PCA documentation, a float n_components in (0, 1) keeps
# the smallest number of components whose cumulative explained variance
# exceeds that fraction (here 95%).
pca = PCA(n_components=0.95)
# Fit on the standardized training features and project them in one step;
# the fitted pca object is reused later to project the test set.
X_pca = pca.fit_transform(X_scaled)

**Linear Regression Model Prep**

In [24]:
# Ordinary least-squares regression on the PCA-projected training features.
model = LinearRegression()
# fit() stays the last expression so the cell still displays the estimator.
model.fit(X=X_pca, y=y)

In [25]:
# Predictions for the same rows the model was fitted on (X_pca), so any
# metric computed from y_pred is an in-sample score, not a held-out estimate.
y_pred = model.predict(X_pca)

In [26]:
# Mean absolute error between the true yields and the model's predictions.
mae = mean_absolute_error(y_true=y, y_pred=y_pred)

In [27]:
# Display the mean absolute error computed in the previous cell.
mae

317.0103751365328

In [28]:
# Coefficient of determination (R^2) of the predictions against the true yields.
r2 = r2_score(y_true=y, y_pred=y_pred)

In [29]:
# Display the R^2 score computed in the previous cell.
r2

0.8875058639829301

**Creation of Submission File**

In [30]:
# Placeholder file name — replace with your own name and roll number.
SUBMISSION_PATH = "Name_LastName_RollNo.csv"

# Load the test set and keep its ids for the submission file.
test_df = pd.read_csv("test.csv")
test_ids = test_df["id"]

# Replay the training-time pipeline with the already-fitted objects
# (transform only, never fit) so the test data is treated exactly like the
# training data: same columns -> same scaling -> same projection.
test_features = test_df[top_features]
test_scaled = scaler.transform(test_features)
test_pca = pca.transform(test_scaled)
test_preds = model.predict(test_pca)

# NOTE(review): the prediction column is named "target" while the training
# column is "yield" — confirm this matches the required submission format.
submission = pd.DataFrame({"id": test_ids, "target": test_preds})
submission.to_csv(SUBMISSION_PATH, index=False)