In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# train data
train = pd.read_csv('datasets/train_data.csv')
train

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
1,2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
2,3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
3,4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
4,5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.00,4,10712
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,Indigo,6E-6178,Bangalore,Night,one,Early_Morning,Mumbai,Economy,7.92,45,3153
19996,19997,AirAsia,I5-582,Kolkata,Morning,one,Afternoon,Delhi,Economy,5.83,24,3911
19997,19998,Vistara,UK-832,Chennai,Early_Morning,two_or_more,Evening,Bangalore,Economy,35.33,17,14822
19998,19999,Vistara,UK-996,Mumbai,Evening,one,Morning,Bangalore,Economy,16.33,21,6450


In [3]:
# test data
test = pd.read_csv('datasets/test_data.csv')
test

Unnamed: 0,id,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
0,1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
1,2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
2,3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.00,30
3,4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
4,5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35
...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Air_India,AI-768,Kolkata,Afternoon,one,Morning,Bangalore,Business,17.42,15
4996,4997,Indigo,6E-6214,Kolkata,Morning,zero,Afternoon,Mumbai,Economy,3.00,40
4997,4998,Air_India,AI-402,Kolkata,Morning,one,Night,Mumbai,Business,11.17,37
4998,4999,Air_India,AI-673,Mumbai,Early_Morning,one,Night,Hyderabad,Business,13.33,38


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                5000 non-null   int64  
 1   airline           5000 non-null   object 
 2   flight            5000 non-null   object 
 3   source_city       5000 non-null   object 
 4   departure_time    5000 non-null   object 
 5   stops             5000 non-null   object 
 6   arrival_time      5000 non-null   object 
 7   destination_city  5000 non-null   object 
 8   class             5000 non-null   object 
 9   duration          5000 non-null   float64
 10  days_left         5000 non-null   int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 429.8+ KB


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                20000 non-null  int64  
 1   airline           20000 non-null  object 
 2   flight            20000 non-null  object 
 3   source_city       20000 non-null  object 
 4   departure_time    20000 non-null  object 
 5   stops             20000 non-null  object 
 6   arrival_time      20000 non-null  object 
 7   destination_city  20000 non-null  object 
 8   class             20000 non-null  object 
 9   duration          20000 non-null  float64
 10  days_left         20000 non-null  int64  
 11  price             20000 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 1.8+ MB


In [6]:
# tarain datasetsga ajratoldi
x = train.drop(['price','flight', 'id'], axis=1)
y = train['price']
test = test.drop(['id', 'flight'], axis=1)

In [7]:
# nomerlilarni ajratib olish
x_nomer = x[['duration','days_left']]
# maxsimum va minimum qiyamtlarni korib olish
x_nomer.describe()

Unnamed: 0,duration,days_left
count,20000.0,20000.0
mean,12.177627,25.92415
std,7.157944,13.624874
min,0.83,1.0
25%,6.83,14.0
50%,11.25,26.0
75%,16.08,38.0
max,38.58,49.0


In [8]:
# barcha malumotlar to`liq
x.isnull().sum()


airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
dtype: int64

In [9]:
# barcha malumotlar to`liq
test.isnull().sum()

airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
dtype: int64

In [10]:
y.isnull().sum()

0

In [11]:
x.columns==test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True])

In [12]:
train[['class']].value_counts()


class   
Economy     13753
Business     6247
Name: count, dtype: int64

In [13]:
# piplinega qoshimcha
nomer_pipeline = Pipeline([
          ('std_scaler', StandardScaler())
])
num_attribs = list(['duration','days_left'])
yozuv_attribs = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']

barcha_pipeline = ColumnTransformer([
    ('nomer', nomer_pipeline, num_attribs),
    ('yozuv', OneHotEncoder(), yozuv_attribs)
])

In [14]:
train_malumot = barcha_pipeline.fit_transform(x)
train_malumot.toarray()

array([[ 0.28952803, -0.36141789,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-1.45682723, -1.38897606,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.36291029, -1.53577008,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 3.23458153, -0.65500594,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.58012154, -0.36141789,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.84836171,  0.00556716,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [15]:
# model qurish
Chiziqli_model = LinearRegression()

In [16]:
# train (o`qitish) chiqli model asosida o`qitildi
Chiziqli_model.fit(train_malumot, y)

In [17]:
train_natija = Chiziqli_model.predict(train_malumot)
train_natija

array([11621.40478192,  2904.46813085, 56748.52233163, ...,
       15387.72932926, 10739.99667029, 11501.82748322])

### test setni tayorlash

In [18]:
# malumotlarni mlga tayorlab olindi
test_malumot = barcha_pipeline.fit_transform(test)
test_malumot.toarray()

array([[ 2.17930407, -1.75411081,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.20548051,  0.58318436,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [-1.41382134,  0.29102246,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.15862287,  0.80230578,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.13704002,  0.87534626,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.51346269,  1.67879147,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

### test data orqali bashorat qilish

In [19]:
test_natija = Chiziqli_model.predict(test_malumot)
test_natija

array([55922.58715278, 53522.52296203, 40657.09027857, ...,
       51487.29495222, 48609.63314027, 51822.9332996 ])

In [20]:
RF_model = RandomForestRegressor()
RF_model.fit(train_malumot, y)

KeyboardInterrupt: 

In [None]:
rf_test_natija = RF_model.predict(test_malumot)

In [None]:
rf_test_natija

array([51957.75, 53762.78, 22686.86, ..., 50135.97, 48127.15, 66410.43])

In [None]:
import pickle

filename = 'RF_model.pkl' # faylga istalgan nom beramiz
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [None]:
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [None]:
smple = pd.read_csv('datasets/sample_solution.csv', index_col=0)
smple['price'] = rf_test_natija
# smple['real'] = y
smple

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,51957.75
2,53762.78
3,22686.86
4,2738.58
5,5753.66
...,...
4996,59501.22
4997,4456.02
4998,50135.97
4999,48127.15


In [None]:
smple.to_csv('avia_sample.csv')