# Linear Regression

Linear regression analysis is used to predict the value of a variable based on the value of another variable. The variable you want to predict is called the dependent variable. The variable you are using to predict the other variable's value is called the independent variable

This week, your task involves conducting multi-class linear regression on batsmen salaries. You'll use the average runs scored per game and the strike rate as independent variables. The goal is to predict the salary as the dependent variable. Additionally, you'll be categorizing the data based on the years.

The dataset is Data_Mendeley.csv given on GitHub. Feel free to create any new functions required.

In [1]:
#import important libraries
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing

preparing data

In [2]:
#mounting gdrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Forward pass

In [3]:
def forward(x):

  pass

Mean Squared Loss

In [4]:
def loss(y,y_pred): #Mean Squared Loss
  n=len(y)
  err=0.0
  for i in range(n):
    err=err+(y_pred[i]-y[i])**2
  return err/n


Implement Linear regression here :)

In [5]:
def LinearRegression(xpoints, ypoints, zpoints, n):
  m1=0.0
  m2=0.0
  b=0.0
  L=0.01
  for _ in range(10000):
    m1 = m1 - L * (-2) * (np.dot(xpoints, zpoints-m2*ypoints-m1*xpoints-b))/float(n)
    m2 = m2 - L * (-2) * (np.dot(ypoints, zpoints-m2*ypoints-m1*xpoints-b))/float(n)
    b = b - L * (-2) * (sum(zpoints-m2*ypoints-m1*xpoints-b))/float(n)

  return m1, m2, b


data=pd.read_csv('/content/drive/MyDrive/Data_Mendeley.csv')
years=data["Year"]
roles=data["Role"]
maxyr=max(years)
minyr=min(years)
runs=data["Ave"]
strrate=data["StrRate"]
salaries=data["Final Price"]


for yr in range(minyr, maxyr+1):
  runs_this_year=np.array([])
  strrate_this_year=np.array([])
  salaries_this_year=np.array([])
  n=0.0
  for i in range(len(years)):
    if years[i]==yr and roles[i]=="Batsman":
      runs_this_year=np.append(runs_this_year, [float(runs[i])])
      strrate_this_year=np.append(strrate_this_year, [float(strrate[i])])
      salaries_this_year=np.append(salaries_this_year, [float(salaries[i])])
      n+=1
  runs_this_year=preprocessing.normalize([runs_this_year])
  strrate_this_year=preprocessing.normalize([strrate_this_year])
  salaries_this_year=preprocessing.normalize([salaries_this_year])
  runs_this_year=runs_this_year.reshape(int(n))
  strrate_this_year=strrate_this_year.reshape(int(n))
  salaries_this_year=salaries_this_year.reshape(int(n))

  m1, m2, b = LinearRegression(runs_this_year, strrate_this_year, salaries_this_year, n)

  salary_pred=m1*runs_this_year + m2*strrate_this_year + b
  error=loss(salaries_this_year, salary_pred)

  print("For year:", yr)
  print("m1 =", m1, " m2 =", m2, " b =", b)
  print("Error =", error)
  print("")



For year: 2008
m1 = -0.023071011411711376  m2 = 0.744116451632194  b = 0.008350217249844336
Error = 0.026152152811234657

For year: 2009
m1 = -0.26669382184327883  m2 = 0.11881857560493586  b = 0.17400246903291564
Error = 0.03198033369815301

For year: 2010
m1 = 0.6720158280141446  m2 = 0.3662217454242011  b = -0.07824574398557142
Error = 0.021205042863143068

For year: 2011
m1 = 0.4286766036545212  m2 = 0.24554620974644817  b = 0.02182190057900566
Error = 0.014600954638789502

For year: 2012
m1 = 0.22097315263696868  m2 = 0.303840019887323  b = 0.04142239986652525
Error = 0.018700677870412904

For year: 2013
m1 = 0.2562794079998437  m2 = 0.14486135808314593  b = 0.06126429990774155
Error = 0.019277090171395266

For year: 2014
m1 = 0.33503267608195236  m2 = 0.16062715165468472  b = 0.05202889926611921
Error = 0.012851008943221592

For year: 2015
m1 = 0.514394799977797  m2 = 0.1420633791354644  b = 0.016312442121581873
Error = 0.010935894990841221

For year: 2016
m1 = 0.5244767332100538

# Logistic Regression

Logistic regression is a process of modeling the probability of a discrete outcome given an input variable. The most common logistic regression models a binary outcome; something that can take two values such as true/false, yes/no, and so on.

In this week you will be doing logistic regression on breast cancer dataset using sklearn library. Feel free to create any new functions required.

In [6]:
#importing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

Prepare Data

In [7]:
breast_cancer = datasets.load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

In [8]:
#spliting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
sc = StandardScaler()
X_train = np.array(sc.fit_transform(X_train))
X_test = np.array(sc.transform(X_test))

Forward pass

In [9]:
def forward_log(x):
  pass

Binary cross entropy loss

In [10]:
def BCELoss(y,y_pred):
  pass

Implement Logistic Regression here :)

In [11]:
def sigmoid(x):
  return 1/(1+np.exp(-x))

class LogisticRegression():

  def __init__(self, L=0.01):
    self.L=L
    self.weights= None
    self.bias = None

  def fit(self, X, y):
    nsamples, nfeatures = X.shape
    self.weights=np.zeros(nfeatures)
    self.bias=0

    for _ in range(10000):
      linear_pred=np.dot(X, self.weights) + self.bias
      pred = sigmoid(linear_pred)

      self.weights=self.weights-self.L * (1/nsamples)*np.dot(X.T, (pred-y))
      self.bias=self.bias-self.L * (1/nsamples)*sum(pred-y)

  def predict(self, X):
    linear_pred=np.dot(X, self.weights)+self.bias
    y_pred=sigmoid(linear_pred)
    class_pred=[0 if y<=0.5 else 1 for y in y_pred]
    return class_pred

obj=LogisticRegression(L=0.1)
obj.fit(X_train, y_train)
y_pred = obj.predict(X_test)

def accuracy(y_pred, y_test):
  return np.sum(y_pred==y_test)/len(y_test)

acc=accuracy(y_pred, y_test)
print(acc)


0.9649122807017544
