# LIBRARIES NEEDED IN THE STUDY

In [1]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn library

# Preprocessing, data splitting, pipelines and metrics
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

# Reading the Data

In [2]:
# Load the training data (features plus the `price` target) from the project's GitHub repository.
URL = "https://github.com/jahongirkb/prediction_air_ticket/blob/main/train_data.csv?raw=true"
df = pd.read_csv(URL)
df.head()  # preview the first five rows

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


# Splitting the data into train and test sets

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=10)

# Features: every column except the target and the `flight` / `id` columns.
X_train = train_set.drop(columns=["price", "flight", "id"])
# Target: the price of each training row.
y = train_set["price"].copy()

# Numeric subset of the training features.
X_num = X_train[["duration", "days_left"]]

# Make pipeline

Pipeline for the numeric features

In [4]:
# Numeric preprocessing: a single step that standardises each column with StandardScaler.
num_pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

Column transformer: scale the numeric columns and one-hot encode the categorical ones

In [5]:
# Columns routed to each branch of the preprocessing transformer.
num_attribs = ["duration", 'days_left']
cat_attribs = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown="ignore" encodes a category that was not seen during fit as all
# zeros instead of raising at transform time (e.g. on held-out or submission data).
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(handle_unknown="ignore"), cat_attribs)
])

In [6]:
# Fit the preprocessing on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [7]:
# Inspect the transformed training matrix.
X_prepared

<16000x37 sparse matrix of type '<class 'numpy.float64'>'
	with 144000 stored elements in Compressed Sparse Row format>

In [8]:
# Train a random forest on the preprocessed training data.
# RandomForestRegressor is already imported in the notebook's import cell.
# A fixed random_state makes the fitted model reproducible across runs.
RF_model = RandomForestRegressor(random_state=10)
RF_model.fit(X_prepared, y)

RandomForestRegressor()

In [9]:
# Draw five rows from the *training* features for a quick sanity check.
# These rows were seen while fitting, so predictions on them do not estimate
# generalisation error. The seed keeps the sample reproducible across runs.
test_data = X_train.sample(5, random_state=10)
test_data

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
5135,Vistara,Mumbai,Morning,one,Afternoon,Delhi,Economy,5.08,10
5890,SpiceJet,Kolkata,Night,one,Morning,Mumbai,Economy,10.67,29
7634,Vistara,Kolkata,Morning,one,Afternoon,Hyderabad,Business,25.83,26
11640,Indigo,Kolkata,Evening,one,Late_Night,Hyderabad,Economy,7.92,38
747,Air_India,Delhi,Early_Morning,one,Late_Night,Bangalore,Economy,19.0,10


In [10]:
# Look up the true prices for the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

5135     12138
5890      6276
7634     49207
11640     8380
747      11520
Name: price, dtype: int64

In [11]:
# Apply the already-fitted preprocessing (transform only, no refit) to the sampled rows.
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

<5x37 sparse matrix of type '<class 'numpy.float64'>'
	with 45 stored elements in Compressed Sparse Row format>

In [12]:
# Predict prices for the sampled rows.
predicted_data = RF_model.predict(test_data_prepared)
predicted_data

array([12298.08,  5963.27, 49628.47,  7474.72, 12119.48])

In [13]:
# Side-by-side comparison of predicted and actual prices for the sampled rows.
pd.DataFrame({'Prediction':predicted_data, 'Real value': test_label})

Unnamed: 0,Prediction,Real value
5135,12298.08,12138
5890,5963.27,6276
7634,49628.47,49207
11640,7474.72,8380
747,12119.48,11520


In [14]:
# Inspect the held-out split produced by train_test_split.
test_set

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
19778,19779,GO_FIRST,G8-103,Delhi,Evening,zero,Night,Kolkata,Economy,2.33,5,5954
4376,4377,Indigo,6E-2005,Delhi,Morning,zero,Morning,Hyderabad,Economy,2.25,42,3013
10188,10189,AirAsia,I5-721,Delhi,Night,one,Morning,Hyderabad,Economy,12.00,21,2050
9887,9888,Vistara,UK-809,Delhi,Evening,one,Morning,Hyderabad,Economy,13.25,34,5761
4441,4442,AirAsia,I5-517,Chennai,Morning,zero,Morning,Hyderabad,Economy,1.17,7,4203
...,...,...,...,...,...,...,...,...,...,...,...,...
13123,13124,Vistara,UK-910,Mumbai,Evening,one,Night,Hyderabad,Business,5.33,21,61597
18706,18707,Indigo,6E-5398,Mumbai,Morning,one,Evening,Chennai,Economy,8.00,7,10731
7274,7275,AirAsia,I5-550,Delhi,Evening,one,Early_Morning,Hyderabad,Economy,13.92,18,2050
16155,16156,Indigo,6E-2247,Delhi,Morning,one,Night,Bangalore,Economy,9.75,31,4007


In [15]:
# Features of the held-out split: the same three columns are removed as for X_train.
X_test = test_set.drop(columns=["price", "flight", "id"])
X_test

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
19778,GO_FIRST,Delhi,Evening,zero,Night,Kolkata,Economy,2.33,5
4376,Indigo,Delhi,Morning,zero,Morning,Hyderabad,Economy,2.25,42
10188,AirAsia,Delhi,Night,one,Morning,Hyderabad,Economy,12.00,21
9887,Vistara,Delhi,Evening,one,Morning,Hyderabad,Economy,13.25,34
4441,AirAsia,Chennai,Morning,zero,Morning,Hyderabad,Economy,1.17,7
...,...,...,...,...,...,...,...,...,...
13123,Vistara,Mumbai,Evening,one,Night,Hyderabad,Business,5.33,21
18706,Indigo,Mumbai,Morning,one,Evening,Chennai,Economy,8.00,7
7274,AirAsia,Delhi,Evening,one,Early_Morning,Hyderabad,Economy,13.92,18
16155,Indigo,Delhi,Morning,one,Night,Bangalore,Economy,9.75,31


In [16]:
# Target values (prices) for the held-out split.
y_test = test_set['price'].copy()
y_test

19778     5954
4376      3013
10188     2050
9887      5761
4441      4203
         ...  
13123    61597
18706    10731
7274      2050
16155     4007
16712    13128
Name: price, Length: 4000, dtype: int64

In [17]:
# Transform the held-out features with the preprocessing fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)

In [18]:
# Predict prices for the held-out split.
y_predicted = RF_model.predict(X_test_prepared)

In [19]:
# Compute the RMSE on the held-out split: mean squared error first, then its square root.
# (The `lin_` prefix is only a name; the predictions being scored come from RF_model.)
lin_mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

3883.246810739448


In [20]:
def display_scores(scores):
    """Print a collection of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of numbers
        Typically the per-fold scores from cross-validation. Objects that
        already provide ``mean()`` and ``std()`` (such as NumPy arrays) are
        used as-is; plain sequences such as lists or tuples are converted
        with ``np.asarray`` first.

    Returns
    -------
    None
        The three summary lines are written to standard output.
    """
    # Only coerce when needed, so inputs with their own mean/std keep their semantics.
    if not (hasattr(scores, "mean") and hasattr(scores, "std")):
        scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
# 10-fold cross-validation of the random forest on the preprocessed training data.
# The scorer is negated MSE, so flip the sign before taking the square root to get RMSE.
# NOTE(review): X_prepared came from a transformer fitted on all of X_train, so each
# fold's validation rows already influenced the scaling/encoding.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)

In [23]:
# Report the per-fold RMSE values with their mean and standard deviation.
display_scores(LR_rmse_scores)

Scores: [4087.75541674 4056.55720348 3864.40239232 4126.88271586 3615.7160062
 3711.52349633 3711.75751257 3836.18360743 3730.17248205 4081.5088202 ]
Mean: 3882.2459653175597
Std.dev: 180.82824411493993


In [24]:
# Load the unlabeled test data from the project's GitHub repository.
URL = "https://github.com/jahongirkb/prediction_air_ticket/blob/main/test_data.csv?raw=true"
df_test = pd.read_csv(URL)
df_test.head()  # preview the first five rows

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
1,2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
2,3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
3,4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
4,5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35


In [25]:
# Check the number of rows and columns in the test data.
df_test.shape

(5000, 11)

In [27]:
# Drop the columns the model was not trained on.
# errors="ignore" keeps this cell safe to re-run: df_test is overwritten here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df_test = df_test.drop(columns=['flight', 'id'], errors="ignore")

In [28]:
# Transform the test features with the preprocessing fitted on the training data.
df_test_prepared = full_pipeline.transform(df_test)

In [29]:
# Predict prices for the test data.
df_predicted = RF_model.predict(df_test_prepared)

In [30]:
# Inspect the predicted prices.
df_predicted

array([52854.79, 58032.77, 23352.5 , ..., 50281.58, 47340.83, 62935.17])

In [31]:
# Check how many predictions were produced.
df_predicted.shape

(5000,)

In [41]:
# Load the submission template (an `id` column and a placeholder `price` column).
URL = "https://github.com/jahongirkb/prediction_air_ticket/blob/main/sample_solution.csv?raw=true"
df_sample = pd.read_csv(URL)
df_sample.head()  # preview the first five rows

Unnamed: 0,id,price
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [33]:
# Check the number of rows and columns in the submission template.
df_sample.shape

(5000, 2)

In [35]:
# Fill the template's `price` column with the model's predictions.
# NOTE(review): the array is assigned by position, so this assumes df_sample's rows are
# in the same order as the rows of the test data — confirm the ids line up.
df_sample['price'] = df_predicted

In [36]:
# Preview the filled-in submission.
df_sample.head()

Unnamed: 0,id,price
0,1,52854.79
1,2,58032.77
2,3,23352.5
3,4,2684.11
4,5,5619.72


In [39]:
# Write the submission file without the DataFrame index column.
df_sample.to_csv("my_submission.csv", index=False)

In [40]:
# Read the submission back to verify what was written.
# Use the same relative path that to_csv wrote to instead of a hardcoded absolute
# directory, so the check also works when the working directory is not /content.
URL = "my_submission.csv"
df_sub = pd.read_csv(URL)
df_sub.head()

Unnamed: 0,id,price
0,1,52854.79
1,2,58032.77
2,3,23352.5
3,4,2684.11
4,5,5619.72
