
# Linear Regression in Python 3.x
## Multiple Linear Regression for ARC Welding Data
### Anirudh Jonnalagadda, PhD
##### Shell Postdoctoral Fellow @ CDS, IISc
###### (anirudhj@iisc.ac.in)

Data taken from [Pal _et al._ (2008), Journal of Materials Processing Technology](https://doi.org/10.1016/j.jmatprotec.2007.09.039)

### Linear Regression Using Scikit-learn

In [4]:
# # for google colab
# !git clone https://github.com/jAnirudh/SVNIT.git

In [5]:
# # for google colab
# import os
# os.chdir('SVNIT') # change directory

In [6]:
import pandas
# Load arc_welding.csv (path is relative to the current working directory) into a DataFrame
dataframe = pandas.read_csv('arc_welding.csv')

In [7]:
# Let's see the contents of the dataframe
dataframe.head() # top 5 rows

Unnamed: 0,Experiment no.,Background voltage (VB),Pulse voltage (Vp),Pulse frequency (Hz),Pulse duty factor,Wire feed rate (m/min),Table feed rate (mm/s),RMS current (V),RMS voltage (V),UTS (MPa)
0,1,17,34.6,130,0.5,9,3.76,1.1939,2.7429,412.28
1,2,17,34.6,130,0.5,9,3.76,1.1415,2.7449,415.79
2,3,14,30.0,80,0.35,11,5.635,1.4385,1.6834,0.0
3,4,14,39.0,80,0.35,7,5.635,1.1971,2.719,328.71
4,5,14,30.0,182,0.65,11,5.635,1.2566,2.3814,385.98


In [8]:
# Inspect the other end of the table as well
dataframe.tail() # last 5 rows

Unnamed: 0,Experiment no.,Background voltage (VB),Pulse voltage (Vp),Pulse frequency (Hz),Pulse duty factor,Wire feed rate (m/min),Table feed rate (mm/s),RMS current (V),RMS voltage (V),UTS (MPa)
48,49,20,39.0,80,0.65,11,2.456,1.3634,3.2698,453.11
49,50,20,39.0,80,0.35,11,5.635,1.3265,2.7507,367.01
50,51,14,39.0,182,0.35,7,2.456,1.107,2.6931,445.03
51,52,17,34.6,130,0.5,9,3.76,1.1947,2.6984,413.43
52,53,20,34.6,130,0.5,9,3.76,1.1786,2.6165,349.2


In [9]:
# How many rows and columns do we have?
# DataFrame.shape is the tuple (number of rows, number of columns), unpacked into the two placeholders.
print('nrows = {:}; ncolumns = {:}'.format(*dataframe.shape))

nrows = 53; ncolumns = 10


In [10]:
# What if you have a larger number of columns? List the column labels directly
dataframe.columns

Index(['Experiment no.', ' Background voltage (VB)', ' Pulse voltage (Vp)',
       ' Pulse frequency (Hz)', ' Pulse duty factor',
       ' Wire feed rate (m/min)', ' Table feed rate (mm/s)',
       ' RMS current (V)', ' RMS voltage (V)', ' UTS (MPa)'],
      dtype='object')

In [11]:
# Drop the "Experiment no." column; drop() returns a new DataFrame and leaves `dataframe` untouched.
# columns=... is the keyword equivalent of passing the label together with axis=1.
df = dataframe.drop(columns='Experiment no.')

In [12]:
# Check the result: the "Experiment no." column should be gone
df.head()

Unnamed: 0,Background voltage (VB),Pulse voltage (Vp),Pulse frequency (Hz),Pulse duty factor,Wire feed rate (m/min),Table feed rate (mm/s),RMS current (V),RMS voltage (V),UTS (MPa)
0,17,34.6,130,0.5,9,3.76,1.1939,2.7429,412.28
1,17,34.6,130,0.5,9,3.76,1.1415,2.7449,415.79
2,14,30.0,80,0.35,11,5.635,1.4385,1.6834,0.0
3,14,39.0,80,0.35,7,5.635,1.1971,2.719,328.71
4,14,30.0,182,0.65,11,5.635,1.2566,2.3814,385.98


In [13]:
# Say you want to see the values of a particular column.
# NOTE: this lookup raises a KeyError because the label typed here does not exactly match the stored one.
df['Background voltage (VB)']

KeyError: 'Background voltage (VB)'

In [14]:
# The KeyError arises because the stored label is not what it looks like in the table:
# it starts with a space (' Background voltage (VB)', see the dataframe.columns output above).
# Taking the label from df.columns avoids having to guess its exact spelling.
df[df.columns[0]]

0     17
1     17
2     14
3     14
4     14
5     20
6     14
7     17
8     20
9     17
10    17
11    17
12    14
13    20
14    20
15    17
16    17
17    14
18    14
19    14
20    17
21    17
22    20
23    17
24    20
25    17
26    20
27    14
28    17
29    14
30    20
31    20
32    20
33    17
34    14
35    14
36    20
37    17
38    14
39    20
40    17
41    20
42    14
43    14
44    14
45    17
46    20
47    17
48    20
49    20
50    14
51    17
52    20
Name:  Background voltage (VB), dtype: int64

In [15]:
# It is therefore generally easier to rename the columns to something more legible.
# The new names are assigned by position, so the list must follow the existing column order.
df.columns = ['background_voltage', 'pulse_voltage', 'pulse_frequency', 'pulse_duty_factor', 
              'wire_feed_rate', 'table_feed_rate', 'rms_current', 'rms_voltage', 'uts']

In [16]:
# Confirm the new column names
df.head()

Unnamed: 0,background_voltage,pulse_voltage,pulse_frequency,pulse_duty_factor,wire_feed_rate,table_feed_rate,rms_current,rms_voltage,uts
0,17,34.6,130,0.5,9,3.76,1.1939,2.7429,412.28
1,17,34.6,130,0.5,9,3.76,1.1415,2.7449,415.79
2,14,30.0,80,0.35,11,5.635,1.4385,1.6834,0.0
3,14,39.0,80,0.35,7,5.635,1.1971,2.719,328.71
4,14,30.0,182,0.65,11,5.635,1.2566,2.3814,385.98


### Let's do regression

In [17]:
# Isolate the independent variables (the features): every column except the last one
X = df[df.columns[:-1]]
X.head()

Unnamed: 0,background_voltage,pulse_voltage,pulse_frequency,pulse_duty_factor,wire_feed_rate,table_feed_rate,rms_current,rms_voltage
0,17,34.6,130,0.5,9,3.76,1.1939,2.7429
1,17,34.6,130,0.5,9,3.76,1.1415,2.7449
2,14,30.0,80,0.35,11,5.635,1.4385,1.6834
3,14,39.0,80,0.35,7,5.635,1.1971,2.719
4,14,30.0,182,0.65,11,5.635,1.2566,2.3814


In [18]:
# Isolate the dependent variable (the target): the last column, 'uts'
Y = df[df.columns[-1]]

In [None]:
# Create the regression object (an unfitted LinearRegression with default settings)
from sklearn import linear_model
model = linear_model.LinearRegression()

In [None]:
# Fit the model on the feature table X and the 'uts' target Y isolated in the earlier cells.
# Reusing X and Y avoids slicing df a second time and keeps this cell consistent with them.
model.fit(X, Y)

In [None]:
# Get the fitted regression coefficients (one per feature column, in the order of the columns passed to fit)
model.coef_

In [None]:
# Get the fitted regression intercept (the constant term of the linear model)
model.intercept_