In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
econmy=pd.read_csv('/content/economy.csv')

In [3]:
bsness=pd.read_csv('/content/business.csv')

In [4]:
df = pd.concat([econmy,bsness])

In [5]:
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,11-02-2022,SpiceJet,SG,8709,18:55,Delhi,02h 10m,non-stop,21:05,Mumbai,5953
1,11-02-2022,SpiceJet,SG,8157,06:20,Delhi,02h 20m,non-stop,08:40,Mumbai,5953
2,11-02-2022,AirAsia,I5,764,04:25,Delhi,02h 10m,non-stop,06:35,Mumbai,5956
3,11-02-2022,Vistara,UK,995,10:20,Delhi,02h 15m,non-stop,12:35,Mumbai,5955
4,11-02-2022,Vistara,UK,963,08:50,Delhi,02h 20m,non-stop,11:10,Mumbai,5955


In [6]:
df.shape

(300261, 11)

In [7]:
df.dtypes

date          object
airline       object
ch_code       object
num_code       int64
dep_time      object
from          object
time_taken    object
stop          object
arr_time      object
to            object
price         object
dtype: object

In [8]:
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')

In [9]:
df['dep_time'] = pd.to_datetime(df['dep_time'], format='%H:%M')
df['arr_time'] = pd.to_datetime(df['arr_time'], format='%H:%M')

In [10]:
df['price'] = df['price'].str.replace(',', '')

In [11]:
df['price'] = pd.to_numeric(df['price'])

In [12]:
df.describe()

Unnamed: 0,num_code,price
count,300261.0,300261.0
mean,1417.771709,20883.717666
std,1974.514439,22695.911266
min,101.0,1105.0
25%,637.0,4783.0
50%,818.0,7425.0
75%,927.0,42521.0
max,9991.0,123071.0


In [13]:
df.dtypes

date          datetime64[ns]
airline               object
ch_code               object
num_code               int64
dep_time      datetime64[ns]
from                  object
time_taken            object
stop                  object
arr_time      datetime64[ns]
to                    object
price                  int64
dtype: object

In [14]:
df.isnull().sum()

date          0
airline       0
ch_code       0
num_code      0
dep_time      0
from          0
time_taken    0
stop          0
arr_time      0
to            0
price         0
dtype: int64

In [15]:
objcol = df.select_dtypes(include = "object").columns

In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for feat in objcol:
    df[feat] = le.fit_transform(df[feat])

In [17]:
df.dtypes

date          datetime64[ns]
airline                int64
ch_code                int64
num_code               int64
dep_time      datetime64[ns]
from                   int64
time_taken             int64
stop                   int64
arr_time      datetime64[ns]
to                     int64
price                  int64
dtype: object

In [18]:
df.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price
0,2022-02-11,4,6,8709,1900-01-01 18:55:00,2,16,39,1900-01-01 21:05:00,5,5953
1,2022-02-11,4,6,8157,1900-01-01 06:20:00,2,18,39,1900-01-01 08:40:00,5,5953
2,2022-02-11,1,4,764,1900-01-01 04:25:00,2,16,39,1900-01-01 06:35:00,5,5956
3,2022-02-11,7,7,995,1900-01-01 10:20:00,2,17,39,1900-01-01 12:35:00,5,5955
4,2022-02-11,7,7,963,1900-01-01 08:50:00,2,18,39,1900-01-01 11:10:00,5,5955


In [19]:
df=df.drop(['date','dep_time','arr_time'],axis=1)

In [20]:
import pandas as pd
import numpy as np

# Function to calculate entropy
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts = True)
    entropy = np.sum([(-counts[i]/np.sum(counts)) * np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy


In [21]:
# Function to calculate information gain
def information_gain(data, feature, target):
    # Calculate parent entropy
    parent_entropy = entropy(data[target])
    
    # Calculate the weighted entropy of each child
    vals, counts = np.unique(data[feature], return_counts=True)
    weighted_entropy = np.sum([(counts[i]/np.sum(counts)) * entropy(data.where(data[feature] == vals[i]).dropna()[target]) for i in range(len(vals))])
    
    # Calculate information gain
    information_gain = parent_entropy - weighted_entropy
    return information_gain

In [22]:
# Function to build decision tree
def build_tree(data, target, features, max_depth):
    # Select the best feature to split the data
    info_gains = [information_gain(data, feature, target) for feature in features]
    best_feature_index = np.argmax(info_gains)
    best_feature = features[best_feature_index]
    
    # Create a node for the decision tree
    tree = {}
    tree['feature'] = best_feature
    
    # Remove the best feature from the list of features
    features = [f for f in features if f != best_feature]
    
    # Recursively build the decision tree
    for value in np.unique(data[best_feature]):
        sub_data = data.where(data[best_feature] == value).dropna()
        sub_tree = {}
        
        # Check for stopping criteria
        if len(sub_data[target].unique()) == 1:
            sub_tree['value'] = sub_data[target].unique()[0]
        elif len(features) == 0 or max_depth == 1:
            sub_tree['value'] = sub_data[target].mode()[0]
        else:
            sub_tree = build_tree(sub_data, target, features, max_depth-1)
        
        tree[value] = sub_tree
        
    return tree

In [23]:
# Function to predict using decision tree
def predict(tree, sample):
    if isinstance(tree, int):
        return tree
    for key in tree.keys():
        if key == 'value':
            return tree['value']
        else:
            value = sample[tree['feature']]
            if str(value) in tree[str(key)]:
                subtree = tree[str(key)][str(value)]
                return predict(subtree, sample)



In [24]:
def predict(tree, sample):
    if isinstance(tree, int):
        return tree
    for key in tree.keys():
        if key == 'value':
            return tree['value']
        else:
            value = sample[tree['feature']]
            if str(value) in tree[key]:
                subtree = tree[key][value]
                return predict(subtree, sample)


In [25]:
# Function to build a random forest regressor
def random_forest_regressor(data, target, num_trees, max_depth):
    trees = []
    for i in range(num_trees):
        # Create a bootstrap sample of the data
        bootstrap_sample = data.sample(frac=1, replace=True)
        
        # Select a random subset of features
        features = bootstrap_sample.columns.tolist()
        features.remove(target)
        num_features = len(features)
        num_features_selected = int(np.sqrt(num_features))
        feature_indices = np.random.choice(num_features, size=num_features_selected, replace=False)
        selected_features = [features[index] for index in feature_indices]
        
        # Build a decision tree using the bootstrap sample and selected features
        tree = build_tree(bootstrap_sample, target, selected_features, max_depth)
        
        # Add the decision tree to the list of trees
        trees.append(tree)
    
    # Return the list of trees
    return trees


In [26]:
def predict_rf(trees, sample):
    predictions = []
    for tree in trees:
        prediction = predict(tree, sample)
        if prediction is not None:
            predictions.append(prediction)
    return np.mean(predictions)



In [None]:
target = 'price'
max_depth = 5
num_trees = 10
trees = random_forest_regressor(df, target, num_trees, max_depth)
sample = df.iloc[0]
prediction = predict_rf(sample, trees)
print('Prediction:', prediction)
samples = df.drop(columns=target)
labels = df[target]
predictions = [predict_rf(trees, sample) for _, sample in samples.iterrows()]
print(predictions)
mse = np.mean((predictions - labels) ** 2)
print('Mean squared error:', mse)

Prediction: 1841.75


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
