# Take a regression dataset from GitHub
# Apply the linear regression algorithm with one variable and with multiple variables

In [1]:
import pandas as pd
import numpy as np

# Load the fish measurements; the relative path means 'Fish.csv' must be in the
# notebook's working directory.
df = pd.read_csv('Fish.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [2]:
# Count missing values per column.
df.isnull().sum()

Species    0
Weight     0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

In [3]:
# Per-row duplicate flags (True where a row repeats an earlier one). The display
# of a long Series is truncated, so this view alone cannot show whether any row
# is duplicated.
# NOTE(review): consider df.duplicated().sum() to get a single count instead.
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
154    False
155    False
156    False
157    False
158    False
Length: 159, dtype: bool

In [4]:
# Single-feature design matrix: the double brackets keep 'Width' as a one-column
# DataFrame rather than a Series.
x = df[['Width']]
x

Unnamed: 0,Width
0,4.0200
1,4.3056
2,4.6961
3,4.4555
4,5.1340
...,...
154,1.3936
155,1.2690
156,1.2558
157,2.0672


In [5]:
# Regression target: the 'Weight' column as a Series.
y = df['Weight']
y

0      242.0
1      290.0
2      340.0
3      363.0
4      430.0
       ...  
154     12.2
155     13.4
156     12.2
157     19.7
158     19.9
Name: Weight, Length: 159, dtype: float64

In [6]:
from sklearn.linear_model import LinearRegression
# Build the linear regression estimator and fit it on Width -> Weight in one
# chained call (fit() hands back the estimator), then display the fitted model.
model = LinearRegression().fit(x, y)
model

LinearRegression()

In [7]:
# Predictions for the same rows the model was fitted on (in-sample fitted values).
predicted = model.predict(x)
predicted

array([ 323.50033766,  377.26412491,  450.77518555,  405.48258327,
        533.20922768,  494.31707627,  560.41114385,  449.62686936,
        478.57949709,  500.34103002,  527.59942075,  473.08263929,
        389.01083473,  521.68841613,  540.1367745 ,  617.16808311,
        561.71005887,  545.16301091,  533.17157797,  644.95356979,
        615.19147328,  577.71118603,  560.71234153,  720.81773737,
        618.8623201 ,  706.24729923,  713.17484604,  668.27756573,
        733.58098938,  809.74635464,  753.89300844,  751.20105411,
        837.36241763,  836.9106211 ,  765.97856566,   -6.3111303 ,
         97.92209448,  113.49024996,  164.35500946,  239.57913195,
        197.67500365,  205.97676492,  186.88836147,  273.50152152,
        234.6093701 ,  285.90710128,  192.6675921 ,  282.08565561,
        288.71200474,  250.30929957,  343.68058274,  302.04000242,
        413.25724858,  465.36444854,  574.81215829,  366.34570873,
        366.51513243,  448.0455815 ,  802.0281639 ,  804.21184

In [8]:
# Feature value(s) of the row labelled 10, used below as a spot-check example.
x.loc[10]

Width    5.1042
Name: 10, dtype: float64

In [9]:
# Actual weight of the row labelled 10, for comparison with the prediction.
y.loc[10]

475.0

In [10]:
# Predict the weight for row 10. Selecting with a list of labels keeps a one-row
# DataFrame, so the input carries the same 'Width' column name the model was
# fitted with (no feature-name warning) and the width is not retyped by hand.
model.predict(x.loc[[10]])

array([527.59942075])

In [11]:
from sklearn.metrics import r2_score
# R^2 expressed as a percentage. `predicted` was produced from the same x/y the
# model was fitted on, so this is an in-sample score rather than an estimate of
# performance on unseen data.
print(r2_score(y, predicted) * 100)

78.58939611400793


In [12]:
# Smallest observed weight.
# NOTE(review): the recorded result is 0.0, which looks like a data-quality
# problem (a fish with no weight) — confirm and consider filtering it out.
print(y.min())

0.0


In [13]:
# Largest observed weight.
print(y.max())

1650.0


In [14]:
# Smallest fitted value. A straight-line fit is not bounded below, so it can be
# negative even though a weight cannot.
print(predicted.min())

-236.04966654189948


In [15]:
# Largest fitted value, to compare against the largest observed weight.
print(predicted.max())

1099.4608804375837


In [16]:
# Reproduce the model's prediction for row 10 by hand: slope * width + intercept.
# The result gets its own name so the target Series `y` is not overwritten by a
# one-element prediction.
manual_prediction = model.coef_ * x.loc[10] + model.intercept_
print(manual_prediction)

Width    527.599421
Name: 10, dtype: float64


In [17]:
# Show the input value(s) of row 10 that went into the hand-computed prediction.
print(x.loc[10])

Width    5.1042
Name: 10, dtype: float64


In [18]:
# Cross-check the hand-computed value against the estimator itself. Row 10 is
# passed as a one-row DataFrame so it keeps the 'Width' column name the model
# was fitted with, instead of a hard-coded, nameless literal.
model.predict(x.loc[[10]])

array([527.59942075])

# Linear Regression with multiple variables

In [20]:
# Multi-feature design matrix: all five numeric measurement columns.
# This rebinds `x`, replacing the single-column frame used above.
x = df[['Length1', 'Length2', 'Length3', 'Height', 'Width']]
x

Unnamed: 0,Length1,Length2,Length3,Height,Width
0,23.2,25.4,30.0,11.5200,4.0200
1,24.0,26.3,31.2,12.4800,4.3056
2,23.9,26.5,31.1,12.3778,4.6961
3,26.3,29.0,33.5,12.7300,4.4555
4,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,11.5,12.2,13.4,2.0904,1.3936
155,11.7,12.4,13.5,2.4300,1.2690
156,12.1,13.0,13.8,2.2770,1.2558
157,13.2,14.3,15.2,2.8728,2.0672


In [21]:
# Re-select the target column from the original frame so `y` is the full
# 'Weight' Series for the multi-variable model.
y = df['Weight']
y

0      242.0
1      290.0
2      340.0
3      363.0
4      430.0
       ...  
154     12.2
155     13.4
156     12.2
157     19.7
158     19.9
Name: Weight, Length: 159, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 42)

In [23]:
# Fit a fresh linear regression on the training split only; fit() hands back the
# estimator, so it can be bound directly and then displayed.
model = LinearRegression().fit(x_train, y_train)
model

LinearRegression()

In [24]:
# Predictions for the held-out test rows (rebinds `predicted` from the
# single-variable section).
predicted = model.predict(x_test)
predicted

array([  84.56540736, -153.91155713,  425.83001137,  300.9526511 ,
        183.75680324,  774.82221354, -216.06698702,  273.48334184,
        271.35968826, 1183.17065768,  600.6868574 ,  799.90465341,
        576.29617583,  172.74512876,  657.51789093,  794.74954135,
        931.64361529,  365.14294371,  270.33812061,  586.46375597,
       -177.21961758,  629.43366787,  531.17459836,  507.17284112,
        799.71089791,  927.87423328,  227.74009208,  306.12445961,
       -240.50794103, -177.62822477,  747.17672447,   21.13808354,
          7.91663837,  880.13228557,  345.86165441,  176.50695297,
        169.32104364,  117.27824206,  229.05619636,  831.64455636,
        662.88073074,  807.93471514, -216.26749747,  162.74581424,
        176.6707879 ,   26.74887152,  135.32795001,  637.25711237,
        162.9880823 ,  112.67546159,  700.51106045,  526.82774648,
        618.87604218,  281.29555669,  954.57798878,  688.10781572,
       -198.78451969,  910.70927248,  123.83310412,  414.45105

In [25]:
# R^2 (as a percentage) of the multi-variable model on the held-out test split.
print(r2_score(y_test, predicted) * 100)

87.03505929194627


In [26]:
# Pull the five feature values of row 10 out as a plain Python list.
ls = x.loc[10].tolist()
print(ls)

[28.4, 31.0, 36.2, 14.2628, 5.1042]


In [27]:
# Actual weight of row 10, for comparison with the multi-variable prediction.
print(y.loc[10])

475.0


In [28]:
# Predict the weight for row 10 with the multi-variable model. A one-row
# DataFrame keeps the five column names the model was fitted with, whereas a
# bare list carries no feature names and triggers scikit-learn's warning.
model.predict(x.loc[[10]])

array([512.58325758])