In [1]:
#General Imports
import pandas as pd
import numpy as np
import datetime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import calendar
import os
import logging

#Importing test modules
from sklearn.model_selection import train_test_split


#Machine learning imports
from sklearn import ensemble
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#Processes Data
from sklearn import preprocessing

#Set logging presets
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

In [2]:
os.getcwd()
os.chdir('/data/workspace_files/Data/')

amazon = pd.read_csv('amazon.csv')
#sns.countplot(x="rating",data =amazon)


#amazon.plot(ind='barh',color='blue',figsize(10,5))
price = amazon["rating"].value_counts()
figure(figsize=(10,6),dpi=100)
price.plot(kind='barh',color='darkgray')


plt.xlabel("frequency")
plt.ylabel("rating")
plt.title("Product Rating vs frequency")

In [2]:
#AMAZON Dataset

#Set path correctly
os.getcwd()
os.chdir('/data/workspace_files/Data/')


logging.info('Starting to read data')

#Data Preperation
data = pd.read_csv('amazon.csv',thousands=',')
data.describe()
label_encoder = preprocessing.LabelEncoder()


logging.info('Cleaning data')
df = pd.DataFrame(data)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

logging.info("Converting data to program-readable format")
#Filtering data column into machine-readable format
data['product_id'] = label_encoder.fit_transform(data['product_id'])
data['product_name'] = label_encoder.fit_transform(data['product_name'])
data['category'] = label_encoder.fit_transform(data['category'])
data['discounted_price'] = label_encoder.fit_transform(data['discounted_price'])
data['actual_price'] = label_encoder.fit_transform(data['actual_price'])
data['discount_percentage'] = label_encoder.fit_transform(data['discount_percentage'])
data['rating'] = label_encoder.fit_transform(data['rating'])
data['rating_count'] = label_encoder.fit_transform(data['rating_count'])
data['about_product'] = label_encoder.fit_transform(data['about_product'])
data['user_id'] = label_encoder.fit_transform(data['user_id'])
data['user_name'] = label_encoder.fit_transform(data['user_name'])
data['review_id'] = label_encoder.fit_transform(data['review_id'])
data['review_title'] = label_encoder.fit_transform(data['review_title'])
data['review_content'] = label_encoder.fit_transform(data['review_content'])
data['img_link'] = label_encoder.fit_transform(data['img_link'])
data['product_link'] = label_encoder.fit_transform(data['product_link'])

#Choosing what the model will predict
X = data.drop('actual_price',axis = 1)
y = data['actual_price']


logging.info("Starting machine learning\n")
#Split data set into testing vs training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

#Starting Machine learning!


logging.info("GBM Starting...")
gbm = ensemble.GradientBoostingRegressor(n_estimators = 500, max_depth=7) 
gbm.fit(X_train,y_train)
logging.info(str(gbm.score(X_test,y_test)*100)+" percent accuracy\n")
#depth of 8 and 500 estimators is the best with %72 to %73 score rating

logging.info("Linear Regression Starting...")
lr_model = LinearRegression()
lr_model = lr_model.fit(X_train,y_train)
logging.info(str(lr_model.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Random Forest Starting...")
r_forest = RandomForestRegressor(n_estimators = 20, random_state = 0)
r_forest.fit(X_train,y_train)
logging.info(str(r_forest.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Decision Tree Starting...")
dt_model = DecisionTreeRegressor(random_state=0, max_depth=10)
dt_model.fit(X_train,y_train)
logging.info(str(dt_model.score(X_test,y_test)*100)+" percent accuracy\n")

2023-03-13 21:52:12,284 - Starting to read data
2023-03-13 21:52:12,584 - Cleaning data
2023-03-13 21:52:12,598 - Converting data to program-readable format
2023-03-13 21:52:12,649 - Starting machine learning

2023-03-13 21:52:12,656 - GBM Starting...
2023-03-13 21:52:18,051 - 72.2979905799176 percent accuracy

2023-03-13 21:52:18,053 - Linear Regression Starting...
2023-03-13 21:52:18,104 - -4.862901657652974 percent accuracy

2023-03-13 21:52:18,105 - Random Forest Starting...
2023-03-13 21:52:18,370 - 70.0221014928446 percent accuracy

2023-03-13 21:52:18,372 - Decision Tree Starting...
2023-03-13 21:52:18,389 - 62.275164642120686 percent accuracy



In [2]:
#AIRLINE CODE

os.getcwd()
os.chdir('/data/workspace_files/Data/')

#Prediction model that predicts stuff
from sklearn import preprocessing

print("Starting to read file")
data = pd.read_csv('flights.csv')

print("Starting the describe process")
data.describe()
label_encoder = preprocessing.LabelEncoder()
#print(data.isnull().sum())



data = pd.DataFrame(data)

#print("replacing infinite stuff")
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

print("Converting columns of data")
data['FlightDate']= label_encoder.fit_transform(data['FlightDate'])
data['Airline']= label_encoder.fit_transform(data['Airline'])
data['Origin']= label_encoder.fit_transform(data['Origin'])
data['Dest']= label_encoder.fit_transform(data['Dest'])
data['Cancelled']= label_encoder.fit_transform(data['Cancelled'])
data['Diverted']= label_encoder.fit_transform(data['Diverted'])
data['CRSDepTime']= label_encoder.fit_transform(data['CRSDepTime'])
data['DepTime']= label_encoder.fit_transform(data['DepTime'])
data['DepDelayMinutes']= label_encoder.fit_transform(data['DepDelayMinutes'])
data['DepDelay']= label_encoder.fit_transform(data['DepDelay'])
data['ArrTime']= label_encoder.fit_transform(data['ArrTime'])
data['ArrDelayMinutes']= label_encoder.fit_transform(data['ArrDelayMinutes'])
data['AirTime']= label_encoder.fit_transform(data['AirTime'])
data['CRSElapsedTime']= label_encoder.fit_transform(data['CRSElapsedTime'])
data['ActualElapsedTime']= label_encoder.fit_transform(data['ActualElapsedTime'])
data['Distance']= label_encoder.fit_transform(data['Distance'])
data['Year']= label_encoder.fit_transform(data['Year'])
data['Quarter']= label_encoder.fit_transform(data['Quarter'])
data['Month']= label_encoder.fit_transform(data['Month'])
data['DayofMonth']= label_encoder.fit_transform(data['DayofMonth'])
data['DayOfWeek']= label_encoder.fit_transform(data['DayOfWeek'])
data['Marketing_Airline_Network']= label_encoder.fit_transform(data['Marketing_Airline_Network'])
data['Operated_or_Branded_Code_Share_Partners']= label_encoder.fit_transform(data['Operated_or_Branded_Code_Share_Partners'])
data['DOT_ID_Marketing_Airline']= label_encoder.fit_transform(data['DOT_ID_Marketing_Airline'])
data['IATA_Code_Marketing_Airline']= label_encoder.fit_transform(data['IATA_Code_Marketing_Airline'])
data['Flight_Number_Marketing_Airline']= label_encoder.fit_transform(data['Flight_Number_Marketing_Airline'])
data['Operating_Airline']= label_encoder.fit_transform(data['Operating_Airline'])
data['DOT_ID_Operating_Airline']= label_encoder.fit_transform(data['DOT_ID_Operating_Airline'])
data['IATA_Code_Operating_Airline']= label_encoder.fit_transform(data['IATA_Code_Operating_Airline'])
data['Tail_Number']= label_encoder.fit_transform(data['Tail_Number'])
data['Flight_Number_Operating_Airline']= label_encoder.fit_transform(data['Flight_Number_Operating_Airline'])
data['OriginAirportID']= label_encoder.fit_transform(data['OriginAirportID'])
data['OriginAirportSeqID']= label_encoder.fit_transform(data['OriginAirportSeqID'])
data['OriginCityMarketID']= label_encoder.fit_transform(data['OriginCityMarketID'])
data['OriginCityName']= label_encoder.fit_transform(data['OriginCityName'])
data['OriginState']= label_encoder.fit_transform(data['OriginState'])
data['OriginStateFips']= label_encoder.fit_transform(data['OriginStateFips'])
data['OriginStateName']= label_encoder.fit_transform(data['OriginStateName'])
data['OriginWac']= label_encoder.fit_transform(data['OriginWac'])
data['DestAirportID']= label_encoder.fit_transform(data['DestAirportID'])
data['DestAirportSeqID']= label_encoder.fit_transform(data['DestAirportSeqID'])
data['DestCityName']= label_encoder.fit_transform(data['DestCityName'])
data['DestState']= label_encoder.fit_transform(data['DestState'])
data['DestStateFips']= label_encoder.fit_transform(data['DestStateFips'])
data['DestStateName']= label_encoder.fit_transform(data['DestStateName'])
data['DestWac']= label_encoder.fit_transform(data['DestWac'])
data['DepDel15']= label_encoder.fit_transform(data['DepDel15'])
data['DepartureDelayGroups']= label_encoder.fit_transform(data['DepartureDelayGroups'])
data['DepTimeBlk']= label_encoder.fit_transform(data['DepTimeBlk'])
data['TaxiOut']= label_encoder.fit_transform(data['TaxiOut'])
data['WheelsOff']= label_encoder.fit_transform(data['WheelsOff'])
data['WheelsOn']= label_encoder.fit_transform(data['WheelsOn'])
data['TaxiIn']= label_encoder.fit_transform(data['TaxiIn'])
data['CRSArrTime']= label_encoder.fit_transform(data['CRSArrTime'])
data['ArrDelay']= label_encoder.fit_transform(data['ArrDelay'])
data['ArrDel15']= label_encoder.fit_transform(data['ArrDel15'])
data['ArrivalDelayGroups']= label_encoder.fit_transform(data['ArrivalDelayGroups'])
data['ArrTimeBlk']= label_encoder.fit_transform(data['ArrTimeBlk'])
data['DistanceGroup']= label_encoder.fit_transform(data['DistanceGroup'])
data['DivAirportLandings']= label_encoder.fit_transform(data['DivAirportLandings'])

print("Done encoding Data that has text")





print("Dropping columns")
X = data.drop('DepDelayMinutes',axis =1)
data.drop('DepDelay',axis=1)
data.drop('CRSDepTime',axis=1)
y = data['DepDelayMinutes']


#Split data set into testing vs training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)


#Starting Machine learning!
print("Linear Regression")
lr_model = LinearRegression()
lr_model = lr_model.fit(X_train,y_train)
print(str(lr_model.score(X_test,y_test)*100)+" percent accuracy\n")

print("Random Forest")
r_forest = RandomForestRegressor(n_estimators = 20, random_state = 0)
r_forest.fit(X_train,y_train)
print(str(r_forest.score(X_test,y_test)*100)+" percent accuracy\n")

print("Decision Tree")
dt_model = DecisionTreeRegressor(random_state=0, max_depth=10)
dt_model.fit(X_train,y_train)
print(str(dt_model.score(X_test,y_test)*100)+" percent accuracy\n")


print("GBM")
gbm = ensemble.GradientBoostingRegressor(n_estimators = 50, max_depth=5) 
gbm.fit(X_train,y_train)
print(str(gbm.score(X_test,y_test)*100)+" percent accuracy\n")


In [None]:
#Extra Code that we don't currently need:

#Another one I could do is
#DepDelayMinutes
#and DepDelay

#print("rid of infinite pt 2")
#assert isinstance(data, pd.DataFrame)
#data.dropna(inplace=True)
#indices_to_keep = ~data.isin([np.nan, np.inf, -np.inf]).any(axis=1)
#data = data[indices_to_keep].astype(np.float64)


#df = pd.DataFrame(data)
#df.replace([np.inf, -np.inf], np.nan, inplace=True)
#data.dropna(inplace=True)
#X = X.values.astype(float)
#y = y.values.astype(float)
#data.replace([np.inf, -np.inf], np.nan, inplace=True)
##y_test = np.nan_to_num(y_test)
#data.fillna(999, inplace=True)

#print(data.isnull().sum())
#21 columns max

In [5]:
#UNIVERSAL CODE

#Set path correctly
os.getcwd()
os.chdir('/data/workspace_files/Data/')

print(os.listdir())
fileName = input("Please enter your file of choosing in this format from the list above: 'file.csv' \n")


logging.info('Starting to read data')

#Data Preperation
data = pd.read_csv(fileName,thousands=',')
data.describe()
label_encoder = preprocessing.LabelEncoder()


logging.info('Cleaning data')
df = pd.DataFrame(data)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)


#Filtering data column into machine-readable format
cols = len(df.axes[1])
z=cols
col = list(data.columns.values)
logging.info("Converting data to program-readable format")
for x in range(z):
    data[col[x]] = label_encoder.fit_transform(data[col[x]])


print(df.axes[1])

columnPredict = input("Please enter your desired datapoint to predict using the format above: 'actual_price' ")
#Choosing what the model will predict
X = data.drop(columnPredict,axis = 1)
y = data[columnPredict]
#amazon data set: "actual_price"


logging.info("Starting machine learning\n")
#Split data set into testing vs training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

#Starting Machine learning!


logging.info("GBM Starting...")
gbm = ensemble.GradientBoostingRegressor(n_estimators = 300, max_depth=7) 
gbm.fit(X_train,y_train)
logging.info(str(gbm.score(X_test,y_test)*100)+" percent accuracy\n")
#depth of 8 and 500 estimators is the best with %72 to %73 score rating

logging.info("Linear Regression Starting...")
lr_model = LinearRegression()
lr_model = lr_model.fit(X_train,y_train)
logging.info(str(lr_model.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Random Forest Starting...")
r_forest = RandomForestRegressor(n_estimators = 20, random_state = 0)
r_forest.fit(X_train,y_train)
logging.info(str(r_forest.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Decision Tree Starting...")
dt_model = DecisionTreeRegressor(random_state=0, max_depth=10)
dt_model.fit(X_train,y_train)
logging.info(str(dt_model.score(X_test,y_test)*100)+" percent accuracy\n")

['amazon.csv', 'flights_2019.csv', 'Combined_Flights_2019.csv.zip', 'flights.csv', 'walmart.csv']
Please enter your file of choosing in this format from the list above: 'file.csv' 
 walmart.csv
Index(['Uniq Id', 'Crawl Timestamp', 'Pageurl', 'Website', 'Title',
       'Num Of Reviews', 'Average Rating', 'Number Of Ratings', 'Model Num',
       'Sku', 'Upc', 'Manufacturer', 'Model Name', 'Price', 'Monthly Price',
       'Stock', 'Carrier', 'Color Category', 'Internal Memory', 'Screen Size',
       'Specifications', 'Five Star', 'Four Star', 'Three Star', 'Two Star',
       'One Star', 'Discontinued', 'Broken Link', 'Joining Key'],
      dtype='object')
Please enter your desired datapoint to predict using the format above: 'actual_price'  Price


2023-03-23 00:01:42 INFO     Starting to read data
2023-03-23 00:01:43 INFO     Cleaning data
2023-03-23 00:01:43 INFO     Converting data to program-readable format
2023-03-23 00:01:49 INFO     Starting machine learning

2023-03-23 00:01:49 INFO     GBM Starting...
2023-03-23 00:02:48 INFO     48.76451089663403 percent accuracy

2023-03-23 00:02:48 INFO     Linear Regression Starting...
2023-03-23 00:02:48 INFO     7.555678985371628 percent accuracy

2023-03-23 00:02:48 INFO     Random Forest Starting...
2023-03-23 00:02:54 INFO     43.795364175591665 percent accuracy

2023-03-23 00:02:54 INFO     Decision Tree Starting...
2023-03-23 00:02:54 INFO     26.021690494669713 percent accuracy



In [3]:
#UNIVERSAL CODE - Tarang

#Set path correctly
os.getcwd()
os.chdir('/data/workspace_files/Data/')

print(os.listdir())
fileName = input("Please enter your file of choosing in this format from the list above: 'file.csv' \n")


logging.info('Starting to read data')

#Data Preperation
data = pd.read_csv(fileName,thousands=',')
data.describe()
label_encoder = preprocessing.LabelEncoder()


logging.info('Cleaning data')
df = pd.DataFrame(data)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)


#Filtering data column into machine-readable format
cols = len(df.axes[1])
z=cols
col = list(data.columns.values)
logging.info("Converting data to program-readable format")
for x in range(z):
    data[col[x]] = label_encoder.fit_transform(data[col[x]])


print(df.axes[1])

columnPredict = input("Please enter your desired datapoint to predict using the format above: 'actual_price' ")
#Choosing what the model will predict
X = data.drop(columnPredict,axis = 1)
y = data[columnPredict]
#amazon data set: "actual_price"


logging.info("Starting machine learning\n")
#Split data set into testing vs training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

#Starting Machine learning!


logging.info("GBM Starting...")
gbm = ensemble.GradientBoostingRegressor(n_estimators = 300, max_depth=7) 
gbm.fit(X_train,y_train)
logging.info(str(gbm.score(X_test,y_test)*100)+" percent accuracy\n")
#depth of 8 and 500 estimators is the best with %72 to %73 score rating

logging.info("Linear Regression Starting...")
lr_model = LinearRegression()
lr_model = lr_model.fit(X_train,y_train)
logging.info(str(lr_model.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Random Forest Starting...")
r_forest = RandomForestRegressor(n_estimators = 20, random_state = 0)
r_forest.fit(X_train,y_train)
logging.info(str(r_forest.score(X_test,y_test)*100)+" percent accuracy\n")

logging.info("Decision Tree Starting...")
dt_model = DecisionTreeRegressor(random_state=0, max_depth=10)
dt_model.fit(X_train,y_train)
logging.info(str(dt_model.score(X_test,y_test)*100)+" percent accuracy\n")

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
y_pred
from sklearn.metrics import accuracy_score
print("Naive Bayes Starting...")
print('Model accuracy score: {0:0.4f}'. format((accuracy_score(y_test, y_pred))*100))
y_pred_train = gnb.predict(X_train)
y_pred_train
print('Training-set accuracy score: {0:0.4f}'. format((accuracy_score(y_train, y_pred_train))*100))
print('Training set score: {:.4f}'.format((gnb.score(X_train, y_train))*100))
print('Test set score: {:.4f}'.format((gnb.score(X_test, y_test))*100))
y_test.value_counts()
null_accuracy = (7407/(7407+2362))
print('Null accuracy score: {:0.4f}'. format((null_accuracy)*100))

['amazon.csv', 'flights_2019.csv', 'Combined_Flights_2019.csv.zip', 'flights.csv', 'walmart.csv']
Please enter your file of choosing in this format from the list above: 'file.csv' 
 amazon.csv
Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')
Please enter your desired datapoint to predict using the format above: 'actual_price'  actual_price
Naive Bayes Starting...
Model accuracy score: 16.0410
Training-set accuracy score: 64.6758
Training set score: 64.6758
Test set score: 16.0410
Null accuracy score: 75.8215


2023-03-22 23:58:16 INFO     Starting to read data
2023-03-22 23:58:16 INFO     Cleaning data
2023-03-22 23:58:16 INFO     Converting data to program-readable format
2023-03-22 23:58:30 INFO     Starting machine learning

2023-03-22 23:58:30 INFO     GBM Starting...
2023-03-22 23:58:33 INFO     73.18187416622499 percent accuracy

2023-03-22 23:58:33 INFO     Linear Regression Starting...
2023-03-22 23:58:33 INFO     -4.862901657652974 percent accuracy

2023-03-22 23:58:33 INFO     Random Forest Starting...
2023-03-22 23:58:33 INFO     70.0221014928446 percent accuracy

2023-03-22 23:58:33 INFO     Decision Tree Starting...
2023-03-22 23:58:33 INFO     62.275164642120686 percent accuracy

