In [19]:
# import resources
import pandas as pd
import seaborn as sns
from sklearn import linear_model

In [2]:
# Load seaborn's example "tips" dataset into a DataFrame named `tips`.
# NOTE(review): load_dataset presumably downloads the data on first use — confirm
# network access is available when re-running on a fresh machine.
tips = sns.load_dataset('tips')

In [3]:
# Preview the first rows to see which columns are numeric and which are categorical.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Create an (unfitted) linear regression estimator and bind it to `lr`.
lr = linear_model.LinearRegression()

In [5]:
# Fit the model  tip ~ total_bill + size.
#   X (independent variables): total bill and party size
#   y (dependent variable):    tip
# Question: how do the bill total and the size of the party relate to the tip given?
lr.fit(
    X=tips[['total_bill', 'size']],
    y=tips['tip'],
)

LinearRegression()

In [6]:
# Slope coefficients of the fitted regression equation, returned as an array.
# The order matches the predictor columns passed to fit():
# first element is for total_bill, second is for size.
lr.coef_

array([0.09271334, 0.19259779])

In [7]:
# Intercept (constant term) of the fitted regression equation.
lr.intercept_

0.6689447408125027

In [8]:
# Try to add `sex` as a third predictor alongside total bill and size.
# This fails because `sex` holds strings ('Female' / 'Male') and the estimator
# needs numeric input.
# The expected ValueError is caught and printed so the demonstration stays visible
# without halting a "Restart Kernel -> Run All".
try:
    lr.fit(X = tips[['total_bill', 'size', 'sex']], y = tips['tip'])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to float: 'Female'

In [9]:
# Dummy-encode the DataFrame: every level of each categorical column becomes its
# own 0/1 indicator column (e.g. sex -> sex_Male, sex_Female), while the numeric
# columns (total_bill, tip, size) are kept as they are.
pd.get_dummies(tips)

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.50,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,0,1,0,0,1
240,27.18,2.00,2,0,1,1,0,0,0,1,0,0,1
241,22.67,2.00,2,1,0,1,0,0,0,1,0,0,1
242,17.82,1.75,2,1,0,0,1,0,0,1,0,0,1


In [11]:
# For a categorical column with k levels, one indicator is redundant because it is
# implied by the other k-1. drop_first=True removes the indicator of the first
# level of every categorical column; that dropped level becomes the baseline the
# remaining indicators are compared against.
tips_dummy = pd.get_dummies(tips, drop_first=True)

In [12]:
# Check the result: one indicator per categorical column is gone (e.g. sex_Male).
tips_dummy.head()

Unnamed: 0,total_bill,tip,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,1.01,2,1,1,0,0,1,1
1,10.34,1.66,3,0,1,0,0,1,1
2,21.01,3.5,3,0,1,0,0,1,1
3,23.68,3.31,2,0,1,0,0,1,1
4,24.59,3.61,4,1,1,0,0,1,1


In [14]:
# Fit the linear regression again, this time on the dummy-encoded DataFrame.
# Predictors: every column except the target (`tip`) and `total_bill`,
# i.e. size plus the sex / smoker / day / time indicator columns.
# The excluded columns are named explicitly instead of slicing by position, so the
# target can never end up inside X if the column order changes.
lr = linear_model.LinearRegression()
lr.fit(X=tips_dummy.drop(columns=['total_bill', 'tip']), y=tips_dummy['tip'])

LinearRegression()

In [15]:
# Coefficients of the selected predictors, in column order:
#   [size, sex_Female, smoker_No, day_Fri, day_Sat, day_Sun, time_Dinner]
# Each indicator coefficient is the change in tip relative to the dropped baseline
# level (Male, smoker Yes, Thur, Lunch), with the other predictors held fixed:
#   - time_Dinner: if time is dinner (=1), the tip is higher by about $0.49
#   - sex_Female:  if sex is Female, the tip is lower by about $0.10
#   - smoker_No:   if the person does not smoke, the tip is lower by about $0.21
lr.coef_

array([ 0.71001644, -0.10057881, -0.20916402, -0.20180568, -0.36603136,
       -0.29452609,  0.48575489])

In [4]:
# Load seaborn's example "titanic" dataset into a DataFrame named `titanic`.
titanic = sns.load_dataset('titanic')

In [5]:
# Preview the first rows of the titanic data.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
# Column dtypes and non-null counts; age, embarked, deck and embark_town
# contain missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [7]:
# Keep only the target (survived) and two categorical predictors (class, who).
titanic_subset = titanic.loc[:, ['survived', 'class', 'who']]

In [8]:
# Preview the selected subset.
titanic_subset.head()

Unnamed: 0,survived,class,who
0,0,Third,man
1,1,First,woman
2,1,Third,woman
3,1,First,woman
4,0,Third,man


In [11]:
# Dummy-encode the categorical columns (class, who) and drop the redundant first
# level of each, which then serves as the baseline for the remaining indicators.
titanic_dummy = pd.get_dummies(titanic_subset, drop_first=True)

In [12]:
# Check the encoded frame: survived plus the class_* and who_* indicator columns.
titanic_dummy.head()

Unnamed: 0,survived,class_Second,class_Third,who_man,who_woman
0,0,0,1,1,0
1,1,0,0,0,1
2,1,0,1,0,1
3,1,0,0,0,1
4,0,0,1,1,0


In [14]:
# fit a logistic regression model, which is used when the dependent variable is binary (yes/no)

In [15]:
# X (independent variables / predictors): all rows and every column except the
# target `survived`, i.e. the class_* and who_* indicator columns.
# The target is dropped by name so it stays out of X regardless of column order.
X = titanic_dummy.drop(columns=['survived'])

In [16]:
# y (dependent variable): the `survived` column — 1 if the passenger survived, 0 if not.
y = titanic_dummy['survived']

In [21]:
# import resources for logistic regression
from sklearn.linear_model import LogisticRegression

In [22]:
# Create an unfitted logistic regression estimator (default settings) named `logreg`.
# Accessed through the `linear_model` namespace, like the LinearRegression cells above.
logreg = linear_model.LogisticRegression()

In [23]:
# Display the estimator (not fitted yet) to see its configuration.
logreg

LogisticRegression()

In [24]:
# Estimate the model from the indicator predictors X and the survival labels y.
logreg.fit(X=X, y=y)

LogisticRegression()

In [25]:
# Fitted coefficients (on the log-odds scale), in column order:
#   [class_Second, class_Third, who_man, who_woman]
# Each is relative to the dropped baseline level (class First; the dropped `who`
# level is not shown in the outputs above — presumably 'child', TODO confirm).
#   - class_Third has the most negative class coefficient: third-class passengers
#     had the lowest chance of surviving
#   - who_woman is positive while who_man is strongly negative: women had a higher
#     chance of surviving than men
logreg.coef_

array([[-0.86376384, -1.95620676, -2.37998804,  0.45872064]])

In [26]:
# Refit on the same data, this time requesting the multinomial formulation.
# NOTE: this rebinds `logreg`, replacing the model fitted earlier.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm it is still accepted by the installed version.
logreg = (
    LogisticRegression(multi_class='multinomial')
    .fit(X, y)
)

In [27]:
# Coefficients of the refitted (multinomial) model: same signs as the earlier
# fit, but smaller in magnitude.
logreg.coef_

array([[-0.46549873, -1.01898083, -1.23823732,  0.20337229]])