# Project Code

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor        
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_columns', None)
import re
import nltk
from googletrans import Translator

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob
from spellchecker import SpellChecker

### Data Preprocessing - Krishna

In [2]:
# Load Data
data = pd.read_csv('Airbnb_Open_Data.csv',low_memory=False)
data.head()

Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,country code,instant_bookable,cancellation_policy,room type,Construction year,price,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,US,False,strict,Private room,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,US,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,US,True,flexible,Private room,2005.0,$620,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,US,True,moderate,Entire home/apt,2005.0,$368,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,US,False,moderate,Entire home/apt,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [3]:
#Harshitha
#Delete-reducing the size of the dataset for testing

print(data.shape)
#Number of rows to drop
droprow=int(len(data)*90/100)
#Dropping the rows
data=data.iloc[:-droprow]

print(data.shape)

(102599, 26)
(10260, 26)


In [4]:
#Extracting features from "house rules"

data['house_rules']=data['house_rules'].fillna('')
data['house_rules']=data['house_rules'].astype(str)


#Translating chinese reviews into english
def translatechinese(text):   
    
    #Creating a function that checks if there are any Chinese house rules
    def containschinese(text):
        if text is None or text.strip=='':
            return False
        else:
            match = bool(re.search('[\u4e00-\u9fff]', text))
            return match
    
    if containschinese(text):
        translator=Translator()
        translated= translator.translate(text,src='zh-cn', dest='en')
        return translated.text
    
    else:
        return text
    
    

data["translated"]=data['house_rules'].apply(lambda x: translatechinese(x))
text_all=data["translated"]


In [5]:
#Cleaning up translated text in house rules

import wordninja

#Converting text into lower case
text=' '.join(list(map(str.lower,text_all)))

#Removing tags
text=re.sub('<.*>.','',text)

#Removing punctuation
text=re.sub(r'[^\w\s]','',text)

#Removing numbers
text=re.sub(r'\d+','',text)

text=set(text.split(' '))


In [6]:
# Delete this code

df=pd.DataFrame(set(text))
df.to_excel("dataframe.xlsx",sheet_name="firstsheet",index=False)

text=' '.join(text)

In [7]:
#Running spell check on the words 

import nltk
from nltk.stem import SnowballStemmer

spell=SpellChecker()
text_list=[]
for word in re.split(r'\s+',text.strip()):

    if spell.correction(word)==None:
        if wordninja.split(word):

            text_list.append(' '.join(wordninja.split(word)))
    else:

        text_list.append(word)          
        text=' '.join(text_list)
        
        

#Removing stopwords
stop_words=set(stopwords.words('english'))
text=[word for word in text.split(' ') if word not in stop_words]
text=' '.join(text)


#Can delete this portion of the code
df=pd.DataFrame(text.split(' '))
df.to_excel("dataframewords.xlsx",sheet_name="firstsheet",index=False)

#Stemming - not producing good results


stemmer=SnowballStemmer('english')
text=([stemmer.stem(word) for word in text.split(' ')])

df=pd.DataFrame(text)
df.to_excel("dataframewordsafterstem.xlsx",sheet_name="firstsheet",index=False)

text=' '.join(text)

In [8]:
finaltext= set(text.split(' '))
print(len(finaltext))

#Printing features to excel(Can delete)
df=pd.DataFrame(finaltext)
df.to_excel("finaltext.xlsx",sheet_name="firstsheet",index=False)

4079


In [9]:

#Can delete this portion of the code
#Tokenizing the words-Using sklearn
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer(vocabulary=finaltext)

X=vectorizer.transform(data["translated"])
bow_array=X.toarray()
feature_names=vectorizer.get_feature_names_out()

#Creating an dataframe with the new features created
extracted_features=pd.DataFrame(bow_array,columns=feature_names)


In [10]:
# Standardize Column Names
data.columns = [col.lower().replace(' ', '_') if len(col.split()) >= 2 else col.lower() for col in data.columns]

# Drop Irrelevant Columns
irrelevant_columns = ['id', 'name', 'host_id', 'host_name', 'license', 'house_rules', 
                      'country', 'country_code', 'lat', 'long', 'service_fee']
data_cleaned = data.drop(columns=irrelevant_columns)

# Clean Price and Service Fee Data
data_cleaned['price'] = data_cleaned['price'].str.replace('[^\d.]', '', regex=True).astype(float)

# Store Price seperately
response = data_cleaned['price']
data_cleaned = data_cleaned.drop(['price'], axis=1)
response = response.interpolate(method='linear')

# Derive days_since_last_review from last_review 
data_cleaned['last_review'] = pd.to_datetime(data_cleaned['last_review'], errors='coerce')
reference_date = datetime.now()
data_cleaned['days_since_last_review'] = (reference_date - data_cleaned['last_review']).dt.days
data_cleaned['days_since_last_review'].fillna(9999, inplace=True)
data_cleaned.drop(columns=['last_review'], inplace=True)

# Derive years_since_construction from construction_year
current_year = datetime.now().year
data_cleaned['years_since_construction'] = current_year - data_cleaned['construction_year']
data_cleaned.drop(columns=['construction_year'], inplace=True)

# Clean neighbourhood_group data
correct_mapping = {'brookln': 'Brooklyn','manhatan': 'Manhattan'}
data_cleaned['neighbourhood_group'] = data_cleaned['neighbourhood_group'].replace(correct_mapping)

# Impute Numerical Missing Data Using Linear Interpolation
numeric_columns = data_cleaned.select_dtypes(include=['float64']).columns
data_cleaned[numeric_columns] = data_cleaned[numeric_columns].apply(lambda col: col.interpolate(method='linear'))

# # Scale Numeric Data
# scaler = StandardScaler()
# data_cleaned[numeric_columns] = scaler.fit_transform(data_cleaned[numeric_columns])
# data_cleaned[numeric_columns] = scaler.transform(data_cleaned[numeric_columns])

# Impute Categorical Missing Data Using Mode Imputation
categorical_columns = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].astype('category')
data_cleaned[categorical_columns] = data_cleaned[categorical_columns].apply(lambda col: col.fillna(col.mode()[0]))

data_cleaned.head()

Unnamed: 0,host_identity_verified,neighbourhood_group,neighbourhood,instant_bookable,cancellation_policy,room_type,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,translated,days_since_last_review,years_since_construction
0,unconfirmed,Brooklyn,Kensington,False,strict,Private room,10.0,9.0,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,1138.0,4.0
1,verified,Manhattan,Midtown,False,moderate,Entire home/apt,30.0,45.0,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,924.0,17.0
2,unconfirmed,Manhattan,Harlem,True,flexible,Private room,3.0,0.0,2.51,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",9999.0,19.0
3,unconfirmed,Brooklyn,Clinton Hill,True,moderate,Entire home/apt,30.0,270.0,4.64,4.0,1.0,322.0,,1975.0,19.0
4,verified,Manhattan,East Harlem,False,moderate,Entire home/apt,10.0,9.0,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",2203.0,15.0


In [11]:
data_cleaned.dtypes


host_identity_verified            category
neighbourhood_group               category
neighbourhood                     category
instant_bookable                  category
cancellation_policy               category
room_type                         category
minimum_nights                     float64
number_of_reviews                  float64
reviews_per_month                  float64
review_rate_number                 float64
calculated_host_listings_count     float64
availability_365                   float64
translated                        category
days_since_last_review             float64
years_since_construction           float64
dtype: object

In [13]:
#Adding ~4000 features extracted from house_rules to datacleaned
categorical_columns = data_cleaned.select_dtypes(include=['category']).columns
data_cleaned=pd.concat([data_cleaned,extracted_features], axis=1)

#Dropping redundant column 'translated' from the dataset
data_cleaned.drop('translated',axis=1,inplace=True)
print(data_cleaned.shape)


# OneHotEncoding for Categorical Variables for Model Compatibility
data_encoded = pd.get_dummies(data_cleaned, drop_first=True)


(10260, 4093)


##### Notes for Harshita

- data_cleaned: Data with all features without encoding, and response (price). Can be used for EDA and Outlier Detection
- data_encoded: OneHotEncoded data. Not meaningful for outlier detection, EDA, or feature selection. Purely for model inputs. This step needs to be done after EDA, outlier detection, and feature selection is completed.
- I also removed useless features such as 'house_rules', 'country', 'country_code', 'lat', 'long' as country and country code for the entire dataset were United States. I removed lat and long as they are raw coordinates that have no significance without context. I removed house rules as it is pure paragraphical text data which we cannot process for a forecasting task. I removed service fee as it is already included in the price (100% correlation to response).

### EDA - Harshita

## Notes for Krishna

Correrlation matrix is taking forever to run, do you have any tricks to make it faster?


#### Correlation between all the features

In [14]:
### Correlation between all the quantitative variables

import seaborn as sns
import matplotlib.pyplot as plt


correlation_matrix=data_encoded.corr()

### Mutual Information between all the features

In [15]:
### Mutual Information between response variable and the rest of the features

from sklearn.feature_selection import mutual_info_regression
columns=data_encoded.columns

Mutual_Inforamtion=[]


for column in columns:

    mi=mutual_info_regression(data_encoded[[column]],response)
    Mutual_Inforamtion.append(mi[0])
    


### Portion of the output(Can delete)
print(Mutual_Inforamtion[1:5][0])

0.0021227726145554016


### PCA - Harshitha

In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


#Scaling the features
scaler=StandardScaler()
scaled_data=scaler.fit_transform(data_encoded)


#PCA with Standardization
pca_standardized=PCA(n_components=100)
pca_components_standardized=pca_standardized.fit(scaled_data)
print("Percentage of variance explained",pca_standardized.explained_variance_ratio_)


#PCA without standardization

pca_unstandardized=PCA(n_components=100)
pca_components_unstandardized= pca_unstandardized.fit(data_encoded)
print("Percentage of variance explained",pca_unstandardized.explained_variance_ratio_)

Percentage of variance explained [0.0100249  0.00896012 0.00836197 0.00744095 0.00703767 0.00680054
 0.00637863 0.00604605 0.00592185 0.00582162 0.00566516 0.0056394
 0.00543532 0.00530089 0.00513925 0.0050467  0.00491914 0.00485636
 0.00482317 0.00466772 0.00458317 0.00456163 0.00452406 0.00445315
 0.00441909 0.00436705 0.00433595 0.00428937 0.00422588 0.00417551
 0.00415646 0.00406292 0.00400388 0.00393735 0.00387918 0.00378026
 0.00375405 0.00373553 0.00368651 0.00366972 0.00363523 0.00362026
 0.00358275 0.00356836 0.00355191 0.00354484 0.00353287 0.00346552
 0.00344564 0.00342808 0.00342431 0.00340341 0.00338604 0.00337188
 0.00336049 0.00332193 0.00329401 0.00328821 0.00325153 0.0032277
 0.00320274 0.00316892 0.00315412 0.00312781 0.00308713 0.00305936
 0.00302684 0.0030173  0.0029764  0.00294556 0.00294144 0.0029012
 0.00287861 0.00287493 0.00285556 0.00281602 0.00279411 0.0027757
 0.00274838 0.00272034 0.00270155 0.00268771 0.00266977 0.00263926
 0.00263319 0.00260835 0.00258162

In [17]:
#Sum of variance explained

sum_unstandardized=sum(pca_standardized.explained_variance_ratio_)
sum_standardized=sum(pca_unstandardized.explained_variance_ratio_)


print(sum_unstandardized)
print(sum_standardized)

0.3851096555806529
0.9999989682491139


In [18]:
#Extracting the relevant features from PCA

# Store Features in X
X_PCA = pca_standardized.fit_transform(data_encoded)
print(X.shape)
# Store Response Variables in y
y = response
print(y.shape)

# Train Test Split
X_PCA_train, X_PCA_test, y_train, y_test = train_test_split(X_PCA, y, test_size=0.2, random_state=42)


(10260, 4079)
(10260,)


### Outlier Detection - Harshita

In [19]:
#Outlier detection with LocalOutlierFactor


from sklearn.neighbors import LocalOutlierFactor

lof=LocalOutlierFactor(n_neighbors=20)
outliers_lof=lof.fit_predict(data_encoded)



Notes about adjustments needed:

Randomly chose a value of 0.1 for contamination in isolation forest- need to figure out how to determine it.
Also need to figure out how to deal with the outliers i.e. delete them or keep them. 

In [20]:
#Outlier detection with Isolation Forest

from sklearn.ensemble import IsolationForest

param={'contamination':[0.001,0.01,0.05,0.1,0.15],'n_estimators':[50,100,200,500],'max_samples':[0.5,0.7,1],'bootstrap':['True','False']}

IsolationForest_model= IsolationForest(random_state=42)
grid=GridSearchCV(IsolationForest_model,param,cv=5,scoring=silhouette_score_func)
IsolationForest_model.fit(data_encoded)

outliers_Isolaionforest=IsolationForest_model.predict(data_encoded)

### Train Test Split - Krishna

In [24]:
# OneHotEncoding for Categorical Variables for Model Compatibility
#data_encoded = pd.get_dummies(data_cleaned, drop_first=True)

# Store Features in X
X = data_encoded

#Scaling X

from sklearn.preprocessing import StandardScaler


#Scaling the features
scaler=StandardScaler()
X=scaler.fit_transform(X)

# Store Response Variables in y
y = response

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Feature Selection -Harshitha

In [None]:
#Using Lasso Regression to perform Feature Selection

from sklearn.linear_model import LassoCV

model_lasso=LassoCV(cv=5, max_iter=10000).fit(X_train,y_train)
model_lasso.mse_path_

Optimal_lambda=  model_lasso.alpha_


#Extracting selected features
coefficients= model_lasso.coef_
selected_features_indices= np.where(coefficients!=0)[0]




In [None]:
selected_features=data_encoded.columns[selected features]


### Feature Selection - Krishna

In [None]:
# Mutual Information Initialization
mi_scores = mutual_info_regression(X_train, y_train)
mi_scores_series = pd.Series(mi_scores, index=X_train.columns)

# Thresholds to test
thresholds = [0.001, 0.005, 0.01]

# Initialize variables to track the best thresholds and scores
best_mi_t = 0
best_mi_score = 0
best_xgb_t = 0
best_xgb_score = 0
best_features_mi = []
best_features_xgb = []

# Loop through thresholds for both MI and XGBoost feature selection
for t in thresholds:
    print(f'Threshold: {t}')
    print('-' * 30)
    
    # Mutual Information Feature Selection
    mi_selected_features = mi_scores_series[mi_scores_series > t].index
    X_train_mi = X_train[mi_selected_features]
    print(f"MI Features Selected: {len(mi_selected_features)}")

    # Evaluate using cross-validation after MI selection
    mi_scores = cross_val_score(XGBRegressor(n_estimators=200,random_state=42), X_train_mi, y_train, cv=10, scoring='r2')
    mi_mean_score = mi_scores.mean()

    # Update best MI threshold and score and save features
    if mi_mean_score > best_mi_score:
        best_mi_score = mi_mean_score
        best_mi_t = t
        best_features_mi = mi_selected_features.tolist()

    print(f"MI Mean CV Score: {mi_mean_score:.4f}")

    # XGBoost Feature Importance Refinement
    xgb_model = XGBRegressor(random_state=42)
    xgb_model.fit(X_train_mi, y_train)
    xgb_importances = pd.Series(xgb_model.feature_importances_, index=X_train_mi.columns)
    print("delete, these are the importancea", xgb_importances)

    # Apply XGBoost threshold to further refine features
    xgb_selected_features = xgb_importances[xgb_importances > t].index
    X_train_xgb = X_train_mi[xgb_selected_features]
    print(f"XGBoost Features Selected: {len(xgb_selected_features)}")

    # Evaluate using cross-validation after XGBoost refinement
    xgb_scores = cross_val_score(XGBRegressor(random_state=42), X_train_xgb, y_train, cv=5, scoring='r2')
    xgb_mean_score = xgb_scores.mean()
    print("delete, this is the score",xgb_mean_score)
    # Update best XGBoost threshold and score and save features
    if xgb_mean_score > best_xgb_score:
        best_xgb_score = xgb_mean_score
        best_xgb_t = t
        best_features_xgb = xgb_selected_features.tolist()


    print(f"XGBoost Mean CV Score: {xgb_mean_score:.4f}")
    print('-' * 30)

# Final Results
print(f"Best MI Threshold: {best_mi_t}, Best MI Mean CV Score: {best_mi_score:.4f}")
print(f"Best XGBoost Threshold: {best_xgb_t}, Best XGBoost Mean CV Score: {best_xgb_score:.4f}")

In [None]:
# Select Best Features from Training and Test Sets
X_train_final = X_train[best_features_xgb]
X_test_final = X_test[best_features_xgb]

print("\nFeatures selected by Mutual Information and XGBoost:\n" + "\n".join(best_features_xgb))

In [None]:
X_train_final.shape, X_test_final.shape

In [None]:
X_train_final.head()

In [None]:
X_test_final.head()

In [None]:
#Harshitha delete

print(data_encoded.columns)

## Linear Regression - Krishna

#### Model Evaluation

In [None]:
# Harshitha's changes included

X_train_final_const = sm.add_constant(X_train)
X_test_final_const = sm.add_constant(X_test)

#Extracting Boolean Columns

boolean=X_train_final_const.select_dtypes(include=['bool']).columns
X_train_final_const=X_train_final_const[boolean].astype(int)


boolean=X_test_final_const.select_dtypes(include=['bool']).columns
X_test_final_const=X_test_final_const[boolean].astype(int)


ols_model = sm.OLS(y_train, X_train_final_const).fit()

y_pred = ols_model.predict(X_test_final_const)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(ols_model.summary(),'\n')
print('='*25)
print("Test Set Performance:")
print('='*25)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {ols_model.rsquared:.2f}")
print('='*25)

In [None]:
#Delete-HT
print(X_train.dtypes)

In [None]:
X_train_final_const = sm.add_constant(X_train_final)
X_test_final_const = sm.add_constant(X_test_final)


ols_model = sm.OLS(y_train, X_train_final_const).fit()

y_pred = ols_model.predict(X_test_final_const)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(ols_model.summary(),'\n')
print('='*25)
print("Test Set Performance:")
print('='*25)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {ols_model.rsquared:.2f}")
print('='*25)

## Random Forest - Krishna

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_final, y_train)

print("Best Parameters:", grid_search.best_params_)

In [None]:
#Delete Harshitha

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

In [None]:
rf_model = RandomForestRegressor(
    n_estimators=grid_search.best_params_['n_estimators'],
    random_state=42,
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_split=grid_search.best_params_['min_samples_split'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf']
)

rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

#### Model Evaluation

In [None]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Random Forest Test Performance:")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

## XGBoost - Krishna

In [None]:
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 10, 100]
}

grid_search_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid_xgb,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search_xgb.fit(X_train, y_train)

print("Best Parameters for XGBoost:", grid_search_xgb.best_params_)

#### Model Evaluation

## Meta Model

Note: Code for Feedforward model needs to be tested

In [None]:
#Feed Forward Neural Network

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


#Scaling the data

scaler=StandardScaler()




model=Sequential([layers.InputLayer(input_shape=(Xtrain.shape[1],)), layers.Dense(64,activation='relu'), layers.Dense(32,activation='relu'),layers.Dense(1)])

#Comiling the model

model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')


#### Model Evaluation

#Code References

https://www.nltk.org/api/nltk.tokenize.punkt.html
https://stackoverflow.com/questions/1801668/convert-a-list-with-strings-all-to-lowercase-or-uppercase
https://stackoverflow.com/questions/55508303/how-to-write-a-list-of-list-into-excel-using-python