In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [33]:
data = pd.read_csv('Top Indian Places to Visit.csv')

In [34]:
data.head()

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
0,0,Northern,Delhi,Delhi,India Gate,War Memorial,1921,0.5,4.6,0,Yes,,Historical,Yes,2.6,Evening
1,1,Northern,Delhi,Delhi,Humayun's Tomb,Tomb,1572,2.0,4.5,30,Yes,,Historical,Yes,0.4,Afternoon
2,2,Northern,Delhi,Delhi,Akshardham Temple,Temple,2005,5.0,4.6,60,Yes,,Religious,No,0.4,Afternoon
3,3,Northern,Delhi,Delhi,Waste to Wonder Park,Theme Park,2019,2.0,4.1,50,Yes,Monday,Environmental,Yes,0.27,Evening
4,4,Northern,Delhi,Delhi,Jantar Mantar,Observatory,1724,2.0,4.2,15,Yes,,Scientific,Yes,0.31,Morning


In [35]:
class OrdinalEncoderAndStandardScalerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mean=None, var=None, encoding_dict=None):
        self.mean = mean
        self.var = var
        self.encoding_dict = encoding_dict

    def fit(self, x, y=None):
        self.ordinal_encoder = OrdinalEncoder()
        self.scaler = StandardScaler()
        return self

    def transform(self, x, y=None):
        series_name = x.name
        _x = x.to_numpy().reshape(-1, 1)
        _x = self.ordinal_encoder.fit_transform(_x)
        categories = self.ordinal_encoder.categories_
        self.encoding_dict = dict(zip((categories[0]), range(len(categories[0]))))
        _x = np.squeeze(self.scaler.fit_transform(_x))
        self.mean = self.scaler.mean_[0]
        self.var = self.scaler.var_[0]
        return pd.Series(_x, name=series_name)

In [36]:
def convert_to_numerical(column):
    column_name = column
    transformer = OrdinalEncoderAndStandardScalerTransformer()
    pipeline = Pipeline([('transform', transformer)])
    transformed_column = pipeline.fit_transform(data[column_name])
    data[column_name] = transformed_column

In [37]:
categorical_list = ['Zone', 'State', 'City', 'Name', 'Type','Establishment Year', 'Airport with 50km Radius', 'Weekly Off', 'Significance', 'DSLR Allowed', 'Best Time to visit']
for i in categorical_list:
    convert_to_numerical(i)

In [38]:
data.head()

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
0,0,0.084472,-1.089446,-0.766887,-0.342843,1.483047,-0.39229,0.5,4.6,0,0.657053,0.192863,-0.555718,0.475831,2.6,0.915825
1,1,0.084472,-1.089446,-0.766887,-0.38581,1.09766,-1.419242,2.0,4.5,30,0.657053,0.192863,-0.555718,0.475831,0.4,-1.133353
2,2,0.084472,-1.089446,-0.766887,-1.653335,0.969198,0.539574,5.0,4.6,60,0.657053,0.192863,0.455527,-2.101587,0.4,-1.133353
3,3,0.084472,-1.089446,-0.766887,1.665861,1.05484,0.691715,2.0,4.1,50,0.657053,-2.21792,-0.960216,0.475831,0.27,0.915825
4,4,0.084472,-1.089446,-0.766887,-0.224684,-0.0585,-1.057907,2.0,4.2,15,0.657053,0.192863,0.860025,0.475831,0.31,1.42812


In [41]:
print(data.isnull().sum())

Unnamed: 0                          0
Zone                                0
State                               0
City                                0
Name                                0
Type                                0
Establishment Year                  0
time needed to visit in hrs         0
Google review rating                0
Entrance Fee in INR                 0
Airport with 50km Radius            0
Weekly Off                          0
Significance                        0
DSLR Allowed                        0
Number of google review in lakhs    0
Best Time to visit                  0
dtype: int64


In [42]:
data.corr()

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
Unnamed: 0,1.0,-0.321936,-0.009088,0.16258,0.062748,0.052263,0.235976,-0.137945,0.155598,-0.092472,-0.130206,0.034947,0.113445,-0.107099,-0.08183,-0.079319
Zone,-0.321936,1.0,-0.400719,0.034505,0.085218,-0.1956,0.067434,0.142247,-0.108166,0.103037,0.008706,0.064219,-0.013449,-0.004752,0.015034,-0.001883
State,-0.009088,-0.400719,1.0,-0.021195,-0.080918,0.086484,-0.14221,-0.092728,0.007928,-0.04804,0.098845,-0.020247,0.041115,-0.132323,0.081643,0.023288
City,0.16258,0.034505,-0.021195,1.0,0.223509,0.124352,0.07428,-0.109737,0.037339,0.033703,-0.054346,0.053498,0.107062,-0.119536,-0.054187,-0.054349
Name,0.062748,0.085218,-0.080918,0.223509,1.0,0.012432,0.087944,0.051058,0.059076,0.036249,0.043201,-0.032921,-0.084085,-0.016525,-0.050715,0.06151
Type,0.052263,-0.1956,0.086484,0.124352,0.012432,1.0,-0.062401,-0.047154,0.199803,-0.117585,-0.147314,-0.069079,0.11733,-0.297492,0.012412,-0.065491
Establishment Year,0.235976,0.067434,-0.14221,0.07428,0.087944,-0.062401,1.0,0.170813,-0.086544,0.063152,-0.048259,0.066771,0.098956,0.162985,-0.082848,0.141484
time needed to visit in hrs,-0.137945,0.142247,-0.092728,-0.109737,0.051058,-0.047154,0.170813,1.0,-0.195736,0.281676,-0.074976,-0.091776,0.106977,0.187748,-0.010589,-0.096262
Google review rating,0.155598,-0.108166,0.007928,0.037339,0.059076,0.199803,-0.086544,-0.195736,1.0,-0.048511,-0.016065,0.125543,-0.062956,-0.333549,0.06717,-0.037629
Entrance Fee in INR,-0.092472,0.103037,-0.04804,0.033703,0.036249,-0.117585,0.063152,0.281676,-0.048511,1.0,-0.003728,-0.00312,-0.045819,0.088778,0.045612,0.00955


In [47]:
x = data.drop(columns = ['Unnamed: 0','State','Type', 'Google review rating', 'Airport with 50km Radius', 'Significance','Entrance Fee in INR'])
y = data['Entrance Fee in INR']

In [82]:
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

x_train shape: (260, 9)
y_train shape: (65, 9)
x_test shape: (260,)
y_test shape: (65,)


In [88]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [89]:
linear_model = LinearRegression()
logistic_model = LogisticRegression()
tree_model  = DecisionTreeClassifier()

In [90]:
linear_model.fit(x_train, y_train)
logistic_model.fit(x_train, y_train)
tree_model.fit(x_train, y_train)

In [94]:
linear = linear_model.predict(x_test)
logistic = logistic_model.predict(x_test)
tree = tree_model.predict(x_test)

In [95]:
print("linear model prediction: ", linear)

linear model prediction:  [ 1.14974925e+01  7.75532514e+02  2.05874492e+01  1.03048384e+02
 -8.35718016e+01  7.79603932e+01  8.68693830e+01  6.18007703e+02
 -4.04331272e+01  5.16171768e+02  1.96969162e+02  1.68810414e+01
  4.43287935e+02  2.06174759e+02  2.89286744e+02  8.81601514e+01
 -1.17769517e+02  2.40164822e+02  5.70074145e+02  2.74103648e+02
  3.44134171e+02  6.48772377e+01  6.96430349e+00  9.19716321e+01
  1.43800140e+02  5.15366723e+02  1.44514355e+02  9.22015442e+01
  1.19436593e+02 -1.53118502e+02  2.28028307e+02 -1.08109397e+02
  1.67326501e+02  3.08706272e+02 -4.72361891e+01  7.84424560e+01
  3.05987010e+02  2.80067548e+02  1.18570688e+02  6.46544134e+02
  1.14470841e+02  1.05836476e+02 -8.38266475e+00  3.11800548e+02
  2.47845335e+01  1.38662980e+02 -1.16512783e+02  2.41329276e+02
  2.58425535e+01  3.38117578e+02  3.00616189e+02  5.40387465e+01
  1.84471779e+02  2.38324986e+02  9.76121208e+02  8.20136869e+01
 -4.21108176e-01 -5.23465714e+00  1.69242185e+02  3.90936695e+02

In [101]:
print("accuracy score:")
print("linearregression model:",linear_model.score(x_test, y_test))
print("logisticregression model:" , logistic_model.score(x_test, y_test))
print("Decisiontreeclassifier model:", tree_model.score(x_test, y_test))

accuracy score:
linearregression model: 0.25891624335371566
logisticregression model: 0.5230769230769231
Decisiontreeclassifier model: 0.3076923076923077
