## Creating DataFrames

#### Creating DataFrame From A List

In [None]:
import pandas as pd

# Each inner list is one row of the table: [name, age].
data = [
    ['Axel', 32],
    ['Alice', 26],
    ['Alex', 45],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df

#### Creating DataFrame From A Dictionary

In [None]:
# Each dictionary key becomes a column; each list holds that column's values.
# NOTE: the 'Captial' key keeps its existing spelling because later cells reference it.
d2f = {
    'State': ['Fl', 'GA', 'TN', 'AL'],
    'Captial': ['Tallahassee', 'Atlanta', 'Nashville', 'Montgomery'],
    'Name': ['John', 'Alex', 'Jim', 'Dan'],
}
df1 = pd.DataFrame(data=d2f)
df1

#### Change Index Of DataFrame

In [None]:
# Build the DataFrame from d2f again, this time with explicit row labels 'A'..'D'
df = pd.DataFrame(d2f, index=['A', 'B', 'C', 'D'])
df

#### Appending A Row

In [None]:
import pandas as pd

# One-row DataFrame labelled 'E' with the same columns as df
new_row = pd.DataFrame([['KY', 'Lexington', 'Kyle']], columns=['State', 'Captial', 'Name'], index=['E'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result and works on old and new versions alike
df = pd.concat([df, new_row])
df

#### Appending Multiple Rows

In [None]:
data1 = {
    'Name': ['Microsoft Corporation', 'Google, LLC', 'Tesla, Inc.'],
    'Symbol': ['MSFT', 'GOOG', 'TSLA'],
    'Shares': [100, 50, 150]
}
df1 = pd.DataFrame(data1)
# print the original dataframe
print("The original dataframe:\n")
print(df1)
# The dataframe to append
data2 = {
    'Name': ['Apple Inc.', 'Netflix, Inc.'],
    'Symbol': ['AAPL', 'NFLX'],  # Apple's ticker is AAPL
    'Shares': [200, 80]
}
df2 = pd.DataFrame(data2)
# print the dataframe to append
print("\nThe dataframe to append:\n")
print(df2)
# Append rows. DataFrame.append was removed in pandas 2.0, so use pd.concat.
# Each frame keeps its own row labels (0,1,2,0,1); pass ignore_index=True to renumber them 0..4.
df3 = pd.concat([df1, df2])
print("\nThe appended dataframe:\n")
print(df3)

#### Adding A Column To DataFrame

In [None]:
import pandas as pd

# Column-oriented sample data: five customers with a location and an amount.
customer_columns = {
    'Name': ['Jane', 'Mitch', 'Alex', 'Evan', 'Melissa'],
    'Location': ['Toronto', 'New York', 'Los Angeles', 'Vancouver', 'Seattle'],
    'Amount': [99.99, 123.12, 150.23, 52.34, 12.34],
}
df = pd.DataFrame(data=customer_columns)

df

In [None]:
# Assigning a list to a new label adds a column; the list supplies one value per existing row
df['Country'] = ['Canada', 'USA', 'USA', 'Canada', 'USA']  # one value per row
#df['Company'] = 'datagy'   # alternative: a single scalar value for the whole column
df

#### Adding Multiple Columns To DataFrame

In [None]:
# Add two columns, one assignment per column
df['New_Column'] = [1, 2, 3, 4, 5]
df['New_Column_1'] = [6, 7, 8, 9, 0]
df

#### Recreating The DataFrame (Setup For Deleting A Row)

In [None]:
# Rebuild the state/capital/name table used by the row- and column-deletion examples.
# The 'Captial' key keeps its existing spelling so it matches the other cells.
d2f = dict(
    State=['Fl', 'GA', 'TN', 'AL'],
    Captial=['Tallahassee', 'Atlanta', 'Nashville', 'Montgomery'],
    Name=['John', 'Alex', 'Jim', 'Dan'],
)
df1 = pd.DataFrame(data=d2f)
df1


#### Changing Row Index

In [None]:
# Same data as d2f, with the rows labelled 'A'..'D' so they can be dropped by label below
df = pd.DataFrame(d2f, index=['A', 'B', 'C', 'D'])
df

#### Deleting A Row

In [None]:
# Remove the row labelled 'C'; list more labels (e.g. ['A', 'B', 'C']) to drop several rows at once
rows_to_drop = ['C']
df = df.drop(index=rows_to_drop)
df

#### Deleting A Column

In [None]:
# Remove the 'Name' column and keep the result
columns_to_drop = ['Name']
df = df.drop(columns=columns_to_drop)
df

#### Renaming Column

In [None]:
# Print df1 before and after renaming 'Name' to 'FirstName'.
# The result of rename() is only printed, not assigned, so df1 itself is unchanged.
print(df1)
print('\n')  ## adds space between lines
print(df1.rename(columns = {'Name': 'FirstName'}))


## Replacing Null(NaN) Values

In [None]:
import numpy as np

# Single-column frame in which two of the four entries are missing (NaN)
raw_values = [700, np.nan, 500, np.nan]
df = pd.DataFrame(data={'values': raw_values})
df

#### Using Fillna

In [None]:
# Replace the missing entries of the 'values' column with 0.
# Assign back to the column (not to df) so df stays a DataFrame instead of being rebound to a Series.
df['values'] = df['values'].fillna(0)
df

#### Using Fillna to replace Null (NaN) values throughout the entire DataFrame

In [None]:
# Two numeric columns whose missing entries alternate from row to row
columns_with_gaps = {
    'values_1': [700, np.nan, 500, np.nan],
    'values_2': [np.nan, 150, np.nan, 400],
}
df = pd.DataFrame(data=columns_with_gaps)
df

In [None]:
# Show a copy with every NaN in every column replaced by 0 (the result is not assigned, so df is unchanged)
df.fillna(0)

#### Fillna Specific Columns Only

In [None]:
# Three numeric columns, each containing two missing entries
columns_with_gaps = {
    'values_1': [700, np.nan, 500, np.nan],
    'values_2': [np.nan, 150, np.nan, 400],
    'values_3': [np.nan, np.nan, 300, 450],
}
df = pd.DataFrame(data=columns_with_gaps)
df

In [None]:
# Fill NaN with 0 only in the listed columns; every other column keeps its missing values
columns_to_fill = ['values_1', 'values_3']
df[columns_to_fill] = df[columns_to_fill].fillna(0)
df

## Dropping Missing Values

In [None]:

# Employee records with gaps: ID, Department and Qualification each contain missing values
data_dict = {
    'Name': ['Ankit', 'Aman', 'Riya', 'Ayush', 'Anushka'],
    'ID': [4001, 4002, np.nan, np.nan, 4010],
    'Department': ['Technical', np.nan, np.nan, np.nan, 'Marketing'],
    'Qualification': [np.nan, np.nan, np.nan, np.nan, 'M.A.'],
}

# Turn the column dictionary into a DataFrame
df = pd.DataFrame.from_dict(data_dict)
df


#### Dropping All Rows that have at least 1 missing value

In [None]:
# Default axis=0, how='any': drop every row that contains at least one missing value
df.dropna()

#### Dropping All Columns having at least 1 missing values

In [None]:
# axis=1 switches to columns: drop every column that contains at least one missing value
df.dropna(axis=1)

#### Dropping Rows or Columns Where All Values Are Null (NaN)

In [None]:
# Same employee records, but here the 'Qualification' column is missing for every row
data_dict = {
    'Name': ['Ankit', 'Aman', 'Riya', 'Ayush', 'Anushka'],
    'ID': [4001, 4002, np.nan, np.nan, 4010],
    'Department': ['Technical', np.nan, np.nan, np.nan, 'Marketing'],
    'Qualification': [np.nan] * 5,
}

df = pd.DataFrame.from_dict(data_dict)

df

In [None]:
# how='all' drops a column only when every value in it is missing
df.dropna(axis=1, how='all')

#### Dropping Rows/Columns That Contain Missing Values Above A Certain Threshold

In [None]:
# Keep rows that contain at least 2 non-missing values; rows with fewer are dropped
df.dropna(axis=0, thresh=2)

#### Dropping Rows If Missing Values Are Present Only In Specific Columns

In [None]:
# Drop only those rows where 'ID' or 'Department' is missing; gaps in other columns are ignored
df.dropna(subset=['ID', 'Department'])

## Replacing Values In DataFrame

In [None]:
import pandas as pd

# Two colour columns of eight rows each, built from repeated runs of the same value
colors = {
    'first_set': ['Green'] * 3 + ['Blue'] * 2 + ['Red'] * 3,
    'second_set': ['Yellow'] * 3 + ['White'] * 2 + ['Blue'] * 3,
}

df = pd.DataFrame(data=colors, columns=list(colors))
df

#### Replacing Values Throughout Entire Dataframe

In [None]:
# Show a copy in which every 'Blue', in any column, becomes 'Green' (df itself is not modified)
df.replace(['Blue'],'Green')


#### Replace More Than 1 Value Throughout Entire DataFrame

In [None]:
 df.replace(['Blue','Red'],'Green')


#### Replacing 1 Value in a Column

In [None]:
# Replace 'Blue' with 'Green' in the 'first_set' column only, and store the result back in df
df['first_set'] = df['first_set'].replace(['Blue'],'Green')
df

#### Replacing Multiple Values in a Column

In [None]:
# Replace both 'Blue' and 'Red' with 'Green' in the 'first_set' column only.
# NOTE(review): if the previous cell has already run, 'Blue' is gone and only 'Red' is affected here.
df['first_set'] = df['first_set'].replace(['Blue','Red'],'Green')
df

#### Replacing More Than 1 Values With Multiple Values

In [None]:
# Pairwise replacement in 'first_set': 'Blue' -> 'Green' and 'Red' -> 'White'.
# NOTE(review): if the two previous cells have already run, 'first_set' no longer holds 'Blue' or 'Red',
# so this changes nothing; re-create df first to see the effect.
df['first_set'] = df['first_set'].replace(['Blue','Red'],['Green','White'])
df

## Loading Data Sets

#### CSV File

In [None]:
import pandas as pd

# Load the practice data set from a CSV file; the raw string (r'...') keeps the Windows backslashes literal.
# NOTE(review): the path is specific to one user's machine; adjust it before running elsewhere.
df1 = pd.read_csv(r'C:\Users\rquiles\OneDrive - Healthesystems, LLC\Desktop\JN_Practice_Data_Set_Healthe_Week.csv')
df1.head()

#### Excel File


In [None]:
# Load the same practice data set from an Excel workbook and preview the first rows.
# NOTE(review): the path is specific to one user's machine; adjust it before running elsewhere.
df2 = pd.read_excel(r'C:\Users\rquiles\OneDrive - Healthesystems, LLC\Desktop\JN_Practice_Data_Set_Healthe_Week.xlsx')
df2.head()


In [None]:
# Print a summary of df2: its columns, non-null counts and dtypes
df2.info()

#### SQL Server

In [None]:
import pyodbc
#import pandas as pd
# Lift pandas' display limits so every column and every row is shown
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)

# ODBC connection string: ProgPerf database on server SQL-ADHOC, using a trusted (Windows) connection
conx_string = "driver={SQL SERVER}; server=SQL-ADHOC; database=ProgPerf; trusted_connection=YES"
conx = pyodbc.connect(conx_string)
# NOTE(review): no conx.close() is visible in this part of the notebook; confirm the connection is released

In [None]:
# SQL text: at most 10 rows of abm_billed_data with service_cd 'trp' and a 2021 date_of_service.
# There is no ORDER BY, so which 10 rows come back is up to the server.
query = """
 select top 10 * from progperf..abm_billed_data where service_cd = 'trp' and year(date_of_service)='2021'

"""

In [None]:
# Run the query over the open connection and load the result set into a DataFrame
df3 = pd.read_sql(query, conx)
df3.head()

## Selecting Rows

#### .iloc

In [None]:
# Position-based selection: rows at positions 1-4 and columns at positions 2-5 (the end of each slice is excluded)
df3.iloc[1:5, 2:6]

#### .loc

In [None]:
# Label-based selection: rows labelled 2 through 5 (both ends included) and the two named columns
df3.loc[2:5, ['Customer', 'Date_of_Loss']]

In [None]:
# Rows labelled 2 and 4; every column from the first one up to and including 'Date_of_Loss'
df3.loc[[2,4], :'Date_of_Loss']  #syntax for specific index columns

In [None]:
# Rows labelled 2 and 4; every column from 'Date_of_Loss' through the last one
df3.loc[[2,4], 'Date_of_Loss':]

In [None]:
# Rows from label 2 to the end; columns from 'Date_of_Loss' through the last one
df3.loc[2:, 'Date_of_Loss':]

In [None]:
# Rows from the start through label 5 (included); columns from 'Date_of_Loss' through the last one
df3.loc[:5, 'Date_of_Loss':]

## Selecting Rows Based On Conditions

In [None]:
# Boolean mask: keep only the rows whose State_of_Venue equals 'CA'
df3.loc[df3['State_of_Venue']=='CA']

In [None]:
# Combine two masks with & (element-wise AND); each condition needs its own parentheses
df3.loc[(df3['State_of_Venue']=='CA') & (df3['Customer']=='Hartford')]

#### Selecting A Column

In [None]:
# Double brackets pass a list of labels, so the result is a one-column DataFrame rather than a Series
df3[['State_of_Venue']]

#### Selecting Multiple Columns

In [None]:
# Select several columns at once by passing a list of column labels
df3 [['State_of_Venue','Claim_Body_Part']]

## Visualization (Matplotlib)

#### Histogram Plots

In [None]:
#Reconnect DataFrame

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

#styling
plt.style.use('fivethirtyeight')

# change size of the plot
plt.figure(figsize=(15,5))

plt.hist(df1['spend'], bins=100, facecolor = 'blue', edgecolor= 'black', label="spend")
plt.title('Spend')
plt.ylabel('Count')
plt.xlabel('Dollars')
plt.legend()
plt.show



#### Scatter Plots (1 Scatter Plot)

In [None]:
# Scatter plot of billed visits (y axis) against spend (x axis)
plt.figure(figsize=(15, 5))
plt.scatter(x=df1['spend'], y=df1['billed_visits'], label='spend')

plt.xlabel('Spend')
plt.ylabel('Billed Visits')
plt.title('Scatter Plot 1')
plt.legend()


#### Scatter Plot (2 Scatter Plot)

In [None]:
plt.figure(figsize=(15,5))

# Two series against the same x axis (spend). Each gets its own legend label;
# both were previously labelled "spend", which made the legend useless.
plt.scatter(df1['spend'], df1['billed_visits'], label="Billed Visits")
plt.scatter(df1['spend'], df1['actual_visits'], label="Actual Visits", c="red")

plt.title('Scatter Plot 2')
plt.ylabel('Visits')
plt.xlabel('Spend')

#can move location of chart legend
plt.legend(loc='upper center')


#### Line Chart

In [None]:
# x values (ages) and y values (salary at each age)
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
dev_y = [
    38496, 42000, 46752, 49320, 53200, 56000,
    62316, 64928, 67317, 68748, 73752,
]

plt.figure(figsize=(15, 5))

# Draw the line first, then decorate the axes. color also accepts hex strings such as #9c27b0
plt.plot(ages_x, dev_y, label="Salaries", color="green")

plt.xlabel('Age')
plt.ylabel('Salary')
plt.title('Salaries By Age')

# 'loc' controls where the legend is placed
plt.legend(loc="upper center")

In [None]:
# x values (ages) shared by both series
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]

# y values: one salary list per series
dev_y = [
    38496, 42000, 46752, 49320, 53200, 56000,
    62316, 64928, 67317, 68748, 73752,
]
dev_y2 = [
    37810, 43515, 46823, 49293, 53437, 56373,
    62375, 66674, 68745, 68746, 74583,
]

plt.figure(figsize=(15, 5))

# One plot call per line. color also accepts hex strings such as #9c27b0
plt.plot(ages_x, dev_y, label="Salaries", color="green")
plt.plot(ages_x, dev_y2, label="Salaries2", color='red')

plt.xlabel('Age')
plt.ylabel('Salary')
plt.title('Salaries By Age')

# 'loc' controls where the legend is placed
plt.legend(loc="upper center")

#### Bar Charts

In [None]:
# x values (ages) and bar heights (salary at each age)
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
dev_y = [
    38496, 42000, 46752, 49320, 53200, 56000,
    62316, 64928, 67317, 68748, 73752,
]

plt.figure(figsize=(15, 5))

# One bar per age. color accepts hex strings such as the one used here
plt.bar(ages_x, dev_y, label="Salaries", color="#9c27b0")

plt.xlabel('Age')
plt.ylabel('Salary')
plt.title('Salaries By Age')

# 'loc' controls where the legend is placed
plt.legend(loc="upper center")

#### Stacked Bar Chart

In [None]:
plt.figure(figsize=(15,5))

# x, y variables
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]

# Lower segment of each stack
dev_y = [45372, 48876, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
plt.bar(ages_x, dev_y, color='yellow', label="Salaries")

# Upper segment: bottom=dev_y starts each bar where the first series ends.
# Without it the second series is merely drawn over the first, which is not a stacked chart.
dev_y2 = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]
plt.bar(ages_x, dev_y2, bottom=dev_y, color="#9c27b0", label="Salaries_2")  #color can take hex numbers (#9c27b0)

plt.title('Salaries By Age')
plt.ylabel('Salary')
plt.xlabel('Age')
plt.legend(loc="upper center")  #can move legend around using 'loc'

#### Side By Side Bar Chart

In [None]:
plt.figure(figsize=(15,5))

# x, y variables
ages_x = [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
# Bars are positioned on 0..len-1 so each series can be shifted sideways by one bar width
x_shift_index = np.arange(len(ages_x))
width = 0.25

# left bar of each group
dev_y = [38496, 42000,46752, 49320, 53200, 56000, 62316, 64928, 67317, 68748, 73752]
plt.bar(x_shift_index - width, dev_y, width = width, color='blue', label="Salaries")

# middle bar of each group
dev_y2 = [45372, 48876, 53850, 57287, 63016, 65998, 70003, 70000, 71496, 75370, 83640]
plt.bar(x_shift_index, dev_y2, width = width, color='yellow', label="Salaries_2")

# right bar of each group
dev_y3 = [37810, 43515, 46823, 49293, 53437, 56373, 62375, 66674, 68745, 68746, 74583]
plt.bar(x_shift_index + width, dev_y3, width = width, color="#9c27b0", label="Salaries_3")  #color can take hex numbers (#9c27b0)

plt.title('Salaries By Age')
plt.ylabel('Salary')
plt.xlabel('Age')
# Label each group with its age; without this the axis shows the positional index 0..10
plt.xticks(ticks=x_shift_index, labels=ages_x)
plt.legend(loc="best")

## Summary/Descriptive Statistics

#### Basic Statistics Methods

In [None]:
# Number of non-null values in the 'spend' column
df1['spend'].count()

In [None]:
# Smallest value in the 'spend' column
df1['spend'].min()

In [None]:
# Largest value in the 'spend' column
df1['spend'].max()

In [None]:
# Arithmetic mean (average) of the 'spend' column

df1['spend'].mean()

In [None]:
# Median (middle value) of the 'spend' column
df1['spend'].median()

In [None]:
# Most frequent value(s) of the 'spend' column; mode() returns a Series because several values can tie
df1['spend'].mode()

In [None]:
# Standard deviation of the 'spend' column

# round(value, 2) rounds the result to 2 decimal places
round(df1['spend'].std(),2)

#### Percentiles

In [None]:
# Using numpy: 25th, 50th and 75th percentiles of 'spend' (q is given on a 0-100 scale)
np.percentile(df1['spend'], q=[25,50,75])

In [None]:
# Using the pandas .quantile method on the column (q is given on a 0-1 scale, so 0.25 is the 25th percentile)
df1['spend'].quantile(q=[0.25])
#df['spend'].quantile(q=[0.25, 0.50])   # several quantiles can be requested at once


#### One Stop Shop Descriptive Stats

In [None]:
# describe() returns the common summary statistics for the column in a single call
df1['spend'].describe()

## An Example Using Summary/Descriptive Statistics

In [None]:
import pandas as pd
import numpy as np

# Sample statistics: four rows for team A followed by five for team B.
# 'assists' and 'rebounds' each contain one missing value.
team_stats = {
    'team': ['A'] * 4 + ['B'] * 5,
    'points': [18, 22, 19, 14, 14, 11, 20, 28, 30],
    'assists': [5, np.nan, 7, 9, 12, 9, 9, 4, 5],
    'rebounds': [11, 8, 10, 6, 6, 5, 9, np.nan, 6],
}
df = pd.DataFrame(data=team_stats)

# display the DataFrame
df

In [None]:
# describe() on a whole DataFrame summarizes its numeric columns by default
df.describe()

In [None]:
# Pass include='object' to summarize the string (object-dtype) columns instead

df.describe(include='object')

#### Group By & Summary Statistics

In [None]:
# Summary statistics per group: mean of every remaining column for each team
df.groupby('team').mean()

In [None]:
# Median of every remaining column for each team
df.groupby('team').median()

In [None]:
# Standard deviation of every remaining column for each team
df.groupby('team').std()
#round(df.groupby('team').std(),2)   # same result rounded to 2 decimal places

## Iterating Over Rows In A DataFrame

#### For Loop Example

In [None]:
a_list = [1, 2, 3, 4, 5]

# Visit each element directly and print twice its value
for number in a_list:
    print(number * 2)

In [None]:
# Nine-row DataFrame: a running number, a letter and a colour per row
loop_columns = {
    'Num': range(1, 10),
    'Letter': list('abcdefghi'),
    'Color': ['yellow', 'green', 'blue', 'organge', 'pink', 'red', 'orange', 'violet', 'purple'],
}
df = pd.DataFrame(data=loop_columns)
df

#### Looping Over Rows In A DataFrame

In [None]:
# iterrows() yields (index label, row) pairs; pull the three fields out of the row before printing
for i, row in df.iterrows():
    num = row['Num']
    letter = row['Letter']
    color = row['Color']
    print('This is the index:', i, 'Column 1:', num, 'Column 2:', letter, 'Column 3:', color)

#### Performing Calculations Looping Over Rows

In [None]:
# Row-by-row calculation: print each 'Num' together with its double
for i, row in df.iterrows():
    num = row['Num']
    doubled = num * 2
    print('Double', num, 'is:', doubled)

#### Using Iterrows To Create A Column

In [None]:
# Fill a new 'Random' column one row at a time, writing each cell through its index label
for i,row in df.iterrows():
    df.loc[i, 'Random'] = np.random.randint(0, 10)  # random integer from 0 through 9 (the upper bound 10 is excluded)
df

## Machine Learning Examples (Supervised)

### KNN Breast Cancer Prediction
#####  Determine whether a patient has Cancer

In [None]:
import numpy as np
import sklearn.datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the breast cancer data set that ships with scikit-learn
data = sklearn.datasets.load_breast_cancer()
# Kept as the last expression so the notebook displays it; in the middle of a cell a bare name does nothing
data

In [None]:
# Feature matrix taken from the loaded data set; print its (rows, columns) shape
X = data.data
print(X.shape)

In [None]:
# Display the raw feature matrix
X

In [None]:
# Target array taken from the loaded data set (the value the classifier learns to predict)
y = data.target


In [None]:
# Put the feature matrix into a DataFrame, using the data set's feature names as column labels

data1 = pd.DataFrame(data.data, columns = data.feature_names)
data1.columns

In [None]:
# Preview the first five rows of the feature DataFrame
data1.head()

In [None]:
# Split into train and test sets; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state =4) # 60% training and 40% test

In [None]:
# Find the best value for K:
# fit a KNN classifier for every K from 1 through 25 and record its accuracy on the test set
k_range = range(1, 26)  # range excludes its upper bound, so 26 is needed to include K=25
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Plot the relationship between K and testing accuracy (one point per K tried above)
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')

In [None]:

# Create a k-NN classifier with 7 neighbors
knn = KNeighborsClassifier(n_neighbors=7) #According to graph, best K values ~ 4,6,7,8,9
# Fit the classifier to the training data
knn.fit(X_train, y_train)
# Predict the response for the test dataset
y_pred = knn.predict(X_test)
print(y_pred)


In [None]:
# Share of test samples predicted correctly, rounded to 2 decimal places
print (round(metrics.accuracy_score(y_test, y_pred),2))

### Linear Regression Boston Housing Market
###### Single variable LR model to predict Boston Home prices based on the number of rooms.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import datasets

In [None]:
# Load the Boston housing data set and print its description.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2; confirm the
# installed version still provides it before running this section.
data = datasets.load_boston()
print(data.DESCR)

In [None]:
# Loads the same data set again (the previous cell already assigned `data`).
# NOTE(review): load_boston was removed in scikit-learn 1.2; confirm the installed version provides it.
data = datasets.load_boston()

In [None]:
# Put the features into a DataFrame, using the data set's feature names as column labels
boston = pd.DataFrame(data.data, columns = data.feature_names)
boston.head()


In [None]:
# Add the home values (the target array) to the DataFrame as column 'MEDV'
boston['MEDV']=data.target
boston.head()

In [None]:
# (number of rows, number of columns), now including 'MEDV'
boston.shape

In [None]:
# Use a heat map to find the variables that correlate best, to choose inputs for the regression
plt.figure(figsize=(15,5))


# Pairwise correlations rounded to 2 decimals; annot=True writes each value inside its cell
correlation_matrix = boston.corr().round(2)
sns.heatmap(data=correlation_matrix, annot=True)

In [None]:
# Create the feature (X) and target (y) arrays. reshape(-1, 1) turns each 1-D column
# into a 2-D array with a single column, so both arrays have the same dimensions.
X = boston['RM'].values.reshape(-1,1)
y = boston['MEDV'].values.reshape(-1,1)
print(X.shape)
print(y.shape)

In [None]:
# Plot to visualize the relationship between the X and y variables
marker_size = 5
plt.scatter(X, y, s=marker_size)  # s is the marker size
plt.ylabel('Home Price')
plt.xlabel('Number of Rooms')



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
 # Train, test, and fitting the linear regression model.
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=.3, random_state=4)
reg1 = LinearRegression()
reg1.fit(X_train, y_train)

In [None]:
# Predict home prices for the held-out test features
y_pred = reg1.predict(X_test)
y_pred

In [None]:
# Calculate the R^2 and RMSE model evaluation metrics
r_squared = reg1.score(X_test, y_test)  # how well the independent variable predicts the dependent variable
print(f"R^2: {r_squared}")
mse = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(mse)  # how tightly the data points sit along the line of best fit (regression line)
print(f'RMSE: {RMSE}')

# Low R^2 score, may need to tune the model to increase the score

#### Calculate The Regression Formula to Predict Prices

In [None]:
plt.figure(figsize=(12,10))
plt.scatter(y_test,y_pred,color='Red',marker='*')

# Flatten both arrays to 1-D so polyfit returns two scalars instead of length-1 arrays
actual = np.ravel(y_test)
predicted = np.ravel(y_pred)
m, b = np.polyfit(actual, predicted, 1)

# Best-fit line through the (actual, predicted) points, evaluated in one vectorized step.
# It is stored under its own name; the earlier loop overwrote the target variable `y`.
y_values = m * actual + b
plt.plot(actual, y_values, linewidth=1, color='blue')


plt.xlabel("Prices",{'size':20})
plt.ylabel("Predictions",{'size':20})
plt.title("Predicted Prices vs Prices",{'size':20})

# NOTE: m and b describe the line fitted to predictions vs. actual prices in this plot;
# the regression model's own slope and intercept are reg1.coef_ and reg1.intercept_.
print ('m:', m)
print('b: ', b)


### Logistic Regression
##### Predicting Credit Card Fraud

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the credit card data set and preview the first rows.
# NOTE(review): the path is specific to one user's machine; adjust it before running elsewhere.
df = pd.read_csv(r'C:\Users\rquiles\OneDrive - Healthesystems, LLC\Desktop\creditcard.csv')
df.head()

In [None]:
# Count each class once, then report both totals (1 is reported as fraud, 0 as non-fraud)
class_counts = df['Class'].value_counts()
print('# of fraud:', class_counts[1])
print('# of non-fraud:', class_counts[0])
#print(df['Class'].value_counts())

In [None]:
# Bar chart of the class counts (0 = non-fraud, 1 = fraud)
class_counts = df['Class'].value_counts()  # computed once and reused for both bars
class_x = [0, 1]
y = [class_counts[label] for label in class_x]
plt.bar(class_x, y)
# Put each tick at its bar's own position. The ticks previously sat at positions [0, 1]
# but carried the labels [1, 0], so every bar was labelled with the wrong class.
plt.xticks(ticks=class_x, labels=class_x)

In [None]:
# Features X: every column except the variable to predict, 'Class'. Target y: the 'Class' column itself.

X = df.drop(['Class'], axis = 1)
y = df['Class']

print(X.shape)
print(y.shape)

In [None]:
# Separate the data set into training and testing sets.
# stratify=y keeps the class ratio the same in both sets, which matters when one class is
# rare (presumably the case for fraud); random_state makes the split reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=4)

In [None]:
# Set up the pipeline. A pipeline chains steps together, which is useful when data must be preprocessed on the fly.

scaler = StandardScaler()  # standardizes each feature to zero mean and unit variance (it does not scale into the 0-1 range)

lr = LogisticRegression()

# Every fit/predict call on `model` runs the scaler first and then the logistic regression
model = make_pipeline(scaler, lr)



In [None]:
# Fit the pipeline (scaler, then logistic regression) on the training data
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the test set
y_pred = model.predict(X_test)
# Predicted probabilities; [:, 1] keeps the second column, the probability of class 1
y_pred_probs = model.predict_proba(X_test)[:,1]
print(y_pred_probs)

In [None]:
#test_accuracy = accuracy_score(y_test, y_pred)*100
# AUC is computed from the predicted class-1 probabilities rather than the hard 0/1 labels,
# so the score reflects how well the model ranks samples across every threshold.
test_auc_roc = roc_auc_score(y_test, y_pred_probs)*100

print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))

print('Testing AUC: %.2f %%' % test_auc_roc)
#print('Testing accuracy: %.4f %%' % test_accuracy)

# The confusion matrix measures the performance of a classification model and is useful
# for imbalanced data sets. Good model = high TP & TN, low FP & FN.
# For labels 0/1, scikit-learn lays the matrix out as:
#   TN  FP
#   FN  TP
# rows = actual values, columns = predicted values

### AUC = area under curve; ROC = receiver operating characteristic

In [None]:
# roc_curve returns (fpr, tpr, thresholds) in that order
fpr, tpr, _ = roc_curve(y_test, y_pred_probs)

# Diagonal reference line from (0, 0) to (1, 1)
x1 = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
y1 = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

# False positive rate on the x axis, true positive rate on the y axis
plt.plot(fpr, tpr)
plt.plot(x1,y1, color='red', marker='_')

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')


In [None]:
""" The relationship between TPR and FPR. AUC-ROC tells you how well the model is capable of distinguishing between classes, 
in this case 0 & 1. It is a measure of separability. A score of 0.5 means no ability to tell between 0 & 1."""

In [None]:
#print(classification_report(y_test, y_pred, digits=6))

### Logistic Regression on Titanic Data Set

In [None]:
# Read the CSV straight from a URL (requires network access) and preview the first rows
df = pd.read_csv('http://bit.ly/kaggletrain')
df.head()

In [None]:
# List the column labels
df.columns

In [None]:
# (number of rows, number of columns) of the full data set
df.shape

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Keep only the rows with a known Embarked value, and only the four listed columns
df = df.loc[df.Embarked.notna(), ['Survived', 'Pclass', 'Sex', 'Embarked']]
df.head()

In [None]:
# Shape after filtering the rows and columns
df.shape

In [None]:
# X is selected with a list of labels, so it stays a (one-column) DataFrame; y is the 'Survived' Series
X = df.loc[:, ['Pclass']]
y = df['Survived']

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with the 'lbfgs' solver chosen explicitly
lr = LogisticRegression(solver = 'lbfgs')

In [None]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 5 cross-validation folds, using Pclass as the only feature
cross_val_score(lr, X, y, cv=5, scoring='accuracy').mean()

### How does performance change adding columns
#### Encoding Categorical Values

In [None]:
from sklearn.preprocessing import OneHotEncoder
# NOTE(review): the `sparse` argument was renamed `sparse_output` in scikit-learn 1.2 and later removed;
# confirm which spelling the installed version expects.
one = OneHotEncoder(sparse=False)  # sparse=False returns a dense array instead of a sparse matrix

In [None]:
# Learn the categories of 'Sex' and return the one-hot encoded array
one.fit_transform(df[['Sex']])

# first column represents female, second column male

In [None]:
# Categories learned by the most recent fit, in the same order as the encoded columns
one.categories_

In [None]:
# Refit the same encoder on 'Embarked' and return its one-hot encoded array
one.fit_transform(df[['Embarked']])

In [None]:
# Categories learned from 'Embarked' (the encoder was refit in the previous cell)
one.categories_

In [None]:
# Use every remaining column as a feature: drop only the target column 'Survived'
X = df.drop('Survived', axis = 1)
X.head()

In [None]:
from sklearn.compose import make_column_transformer # use when columns of a DataFrame need different preprocessing

# One-hot encode 'Sex' and 'Embarked'; remainder='passthrough' keeps the other columns unchanged
column_trans = make_column_transformer((OneHotEncoder(), ['Sex', 'Embarked']), remainder='passthrough')

In [None]:
column_trans.fit_transform(X)  # first 2 columns are the one-hot encoded Sex, the next 3 columns are the one-hot encoded Embarked, and
                                # the last column is the passthrough column Pclass

In [None]:
from sklearn.pipeline import make_pipeline  #pipeline is for chaining steps together

In [None]:
# Chain the column transformer and the logistic regression into a single estimator
pipe = make_pipeline(column_trans, lr)

In [None]:
# Mean accuracy over 5 cross-validation folds for the whole pipeline (encoding + model)
cross_val_score(pipe, X, y, cv =5 , scoring='accuracy').mean()  #cross val 

In [None]:
# Draw 5 rows from X to stand in for new data; random_state makes the draw repeatable
X_new = X.sample(5, random_state=99)
X_new

In [None]:
# Fit the whole pipeline on all of X and y
pipe.fit(X,y)

In [None]:
# Predict the 'Survived' label for the sampled rows
pipe.predict(X_new)