# Import libraries

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
# Globally ignore every warning category from this point on. This also hides
# deprecation and data-conversion warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

# Load the dataset

In [None]:
df= pd.read_csv("https://raw.githubusercontent.com/Premalatha-success/Datasets/main/TaxiFare.csv")

In [None]:
df.shape

In [None]:
df.sample(10)

# Data Wrangling

In [None]:
df=df.drop('unique_id', axis=1)  
print(df.shape)
df.head()

In [None]:
df.dtypes

In [None]:
#change to datetime format
df["date_time_of_pickup"]= pd.to_datetime(df["date_time_of_pickup"])
df.head()

In [None]:
df.dtypes

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df.describe(include='all')

In [None]:
#take individual values
df['hour'] = df['date_time_of_pickup'].dt.hour
df['minute'] = df['date_time_of_pickup'].dt.minute
df['date'] = df['date_time_of_pickup'].dt.day
df['day'] = df['date_time_of_pickup'].dt.dayofweek
df['month'] = df['date_time_of_pickup'].dt.month
df['year'] = df['date_time_of_pickup'].dt.year

In [None]:
df.head()

In [None]:
df=df.drop('date_time_of_pickup', axis=1)
df.head()

In [None]:
df.boxplot(column=['latitude_of_pickup'])
plt.show()

In [None]:
#latitude is exceeding +90
df['latitude_of_pickup']= np.where(df['latitude_of_pickup']>90, np.nan, df['latitude_of_pickup'])
df.dropna(inplace=True)
df.shape

In [None]:
#to find the distance between 2 points on earth
from geopy.distance import geodesic as GD

def Distance(a):
    """Return the geodesic distance in kilometres between two points.

    Parameters
    ----------
    a : sequence or pandas.Series
        Coordinates ordered as (longitude 1, latitude 1, longitude 2,
        latitude 2). Any items after the first four are ignored.

    Returns
    -------
    float
        The ``.km`` value of ``GD`` evaluated on the two points.
    """
    # Unpack by position rather than using a[0]..a[3]: integer [] lookups on a
    # label-indexed Series are deprecated in pandas, while iteration always
    # yields the values in order (and works for lists/tuples too).
    lon1, lat1, lon2, lat2, *_ = a

    # The points are passed to GD as (latitude, longitude) pairs.
    cord1 = (lat1, lon1)
    cord2 = (lat2, lon2)

    return GD(cord1, cord2).km
    

In [None]:
df['distance'] = df[['longitude_of_pickup','latitude_of_pickup','longitude_of_dropoff','latitude_of_dropoff']].apply(Distance,axis=1)

In [None]:
df.drop(['longitude_of_pickup','latitude_of_pickup','longitude_of_dropoff','latitude_of_dropoff'], axis=1, inplace=True)
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
#remove negative values
df['amount']= np.where(df['amount']<0, np.nan, df['amount'])
df.dropna(inplace=True)
df.shape

In [None]:
sns.countplot(x='no_of_passenger', data=df)

In [None]:
sns.pairplot(df, diag_kind='kde')

# Handling duplicates

In [None]:
duplicate=df.duplicated()
print(duplicate.sum())
df[duplicate]

In [None]:
df.describe()

# Handling Outliers

In [None]:
# Box plot of the passenger count before any capping
df.boxplot(column='no_of_passenger')
plt.show()

In [None]:
# Cap the outliers using the IQR rule.
# remove_outlier(col) is a user-defined helper that returns the lower and
# upper threshold values derived from the IQR.

def remove_outlier(col):
    """Return the IQR-based outlier thresholds for a column.

    Despite its name, this function removes nothing: it only computes the
    bounds that callers can use to cap or filter values.

    Parameters
    ----------
    col : pandas.Series
        Numeric data; must provide ``quantile``.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where ``lower_range`` is
        ``Q1 - 1.5 * IQR`` and ``upper_range`` is ``Q3 + 1.5 * IQR``.
    """
    # quantile() does not depend on element order, so the data is not sorted
    # first.
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range

In [None]:
lowno,uppno = remove_outlier(df['no_of_passenger'])
df['no_of_passenger']= np.where(df['no_of_passenger']>uppno, uppno, df['no_of_passenger'])
df['no_of_passenger']= np.where(df['no_of_passenger']<lowno, lowno, df['no_of_passenger'])

In [None]:
df.boxplot(column=['no_of_passenger'])
plt.show()

# Scaling

In [None]:
from scipy.stats import zscore
df_z = df.apply(zscore)
df_z= pd.DataFrame(df_z, columns= 'amount no_of_passenger hour minute date day month year distance'.split())

In [None]:
df.head()

In [None]:
#from sklearn.preprocessing import StandardScaler
#std_scale= StandardScaler()
#std_scale

In [None]:
#df['no_of_passenger']= std_scale.fit_transform(df[['no_of_passenger']])
#df['hour']= std_scale.fit_transform(df[['hour']])
#df['minute']= std_scale.fit_transform(df[['minute']])
#df['date']= std_scale.fit_transform(df[['date']])
#df['day']= std_scale.fit_transform(df[['day']])
#df['month']= std_scale.fit_transform(df[['month']])
#df['year']= std_scale.fit_transform(df[['year']])
#df['distance']= std_scale.fit_transform(df[['distance']])

In [None]:
#df.head()

# Split Data and Train

In [None]:
X= df.drop(['amount'], axis=1)
y= df[['amount']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Approach 1

In [None]:
model= LinearRegression() 
model.fit(X_train, y_train)

In [None]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# Approach 2

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly=PolynomialFeatures(degree=2, interaction_only=True) 
X_train2 = poly.fit_transform(X_train)
X_test2 = poly.fit_transform(X_test)

poly_clf= linear_model.LinearRegression()
poly_clf.fit(X_train2, y_train)

In [None]:
print(poly_clf.score(X_train2, y_train))
print(poly_clf.score(X_test2, y_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)

In [None]:
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

# Approach 3

In [None]:
from sklearn.svm import SVR 

In [None]:
svr_model= SVR(C=2, gamma=0.25, kernel='rbf')
svr_model.fit(X_train, y_train)

In [None]:
print(svr_model.score(X_train, y_train))
print(svr_model.score(X_test, y_test))

# Approach 4

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
dtree= DecisionTreeRegressor()
dtree.fit(X_train, y_train)

In [None]:
print(dtree.score(X_train, y_train))
print(dtree.score(X_test, y_test))

# Approach 5

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
bgr= BaggingRegressor()
bgr.fit(X_train, y_train)

In [None]:
print(bgr.score(X_train, y_train))
print(bgr.score(X_test, y_test))

# Approach 6

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
abr= AdaBoostRegressor()
abr.fit(X_train, y_train)

In [None]:
print(abr.score(X_train, y_train))
print(abr.score(X_test, y_test))