## Flight Fare & Airbnb Recommender System

Creating recommender systems that can easily be entered into VS code for Streamlit App.

In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
df = pd.read_csv('../data/flight_data_clean.csv')

In [6]:
df.head()

Unnamed: 0,year,city1,city2,fare,state1,state2
0,2021,Albuquerque,Boston,324.97,NM,MA (Metropolitan Area)
1,2021,Albuquerque,Boston,315.9,NM,MA (Metropolitan Area)
2,2021,Albuquerque,Boston,329.22,NM,MA (Metropolitan Area)
3,2021,Albuquerque,Washington,255.89,NM,DC (Metropolitan Area)
4,2021,Albuquerque,Washington,291.16,NM,DC (Metropolitan Area)


In [7]:
df.shape

(16099, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16099 entries, 0 to 16098
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    16099 non-null  int64  
 1   city1   16099 non-null  object 
 2   city2   16099 non-null  object 
 3   fare    16099 non-null  float64
 4   state1  16099 non-null  object 
 5   state2  16099 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 754.8+ KB


First go at trying to create a recommender system.  This was followed up with a CountVectorizer and Cosine Similarities added. 

In [10]:
#recommends the top 5 cities with the lowest fares within a given budget range from a specified starting city.
def recommend_city(df, starting_city, budget_range):
    
    min_fare, max_fare = budget_range
    
#filter fare to be within budget range
    fare_filter = (df['fare'] >= min_fare) & (df['fare'] <= max_fare)
    filtered_df = df[fare_filter]
    
#filter so starting_city is city1
    city_filter = (filtered_df['city1'] == starting_city)
    filtered_df = filtered_df[city_filter]
    
#sort by fare
    sorted_df = filtered_df.sort_values(by='fare')
    
#select the top 5 city2(destination recommendations) with lowest fares, and no repeated recommended cities
    recs = sorted_df.drop_duplicates(subset=['city2']).head(5)
    
#creates new column for recommended city
    recs['recommended_city'] = recs['city2']
    
#selects and reorders columns to provide the recommended city and fare
    recs = recs[['city1', 'recommended_city', 'fare']]
    
    return recs

In [11]:
#preprocessing the data
df_re = df[['city1', 'city2', 'fare']].dropna(subset=['fare'])

df_re['fare'] = pd.to_numeric(df_re['fare'], errors='coerce')

df_re.dropna(inplace=True)

In [12]:
# testing with Albuquerque and budget range of 100-200
starting_city = "Albuquerque"
budget_range = (100, 200)
recs = recommend_city(df_re, starting_city, budget_range)
print(recs)

            city1 recommended_city    fare
5094  Albuquerque      Los Angeles  130.34
4209  Albuquerque    San Francisco  144.28
4021  Albuquerque          Chicago  146.63
5029  Albuquerque    New York City  154.33
4199  Albuquerque       Washington  171.82


In [13]:
#second city testing
starting_city = "Chicago"
budget_range = (250, 300)
recs = recommend_city(df_re, starting_city, budget_range)
print(recs)

         city1 recommended_city    fare
14409  Chicago          Seattle  250.07
12375  Chicago      Los Angeles  250.42
11354  Chicago    New York City  250.72
10328  Chicago    San Francisco  250.78
15470  Chicago        Las Vegas  251.81


In [14]:
df['city1'].value_counts()

city1
Los Angeles          1172
Boston               1115
Chicago               786
Dallas/Fort Worth     741
Houston               711
                     ... 
Bangor                  8
Hilton Head             5
Eureka/Arcata           4
Tulsa                   3
Spokane                 3
Name: count, Length: 96, dtype: int64

# Recommender System with CountVectorizer Added & Cosine Similarities on Flight Fare Dataset
#### Cell utilized to combine and copy and past into VS code for Streamlit

In [16]:
#list of cities to keep that line up with Airbnb dataset
cities_to_keep = [
    'Austin', 'Boston', 'Chicago', 'Columbus', 'Denver', 'Ft Lauderdale', 
    'honolulu', 'Jersey City', 'Las Vegas', 'Los Angeles', 'Nashville', 
    'New Orleans', 'New York City', 'Portland', 'Rhode Island', 
    'San Diego', 'San Francisco', 'Seattle', 'Minneapolis', 'Washington']

#filter the DataFrame to keep only the specified cities in the 'city2' column
df = df[df['city2'].isin(cities_to_keep)]

#define the recommendation function
def recommend_destinations(starting_city, min_budget, max_budget, num_recommendations=5):
#filter the DataFrame based on the budget range
    filtered_df = df[(df['fare'] >= min_budget) & (df['fare'] <= max_budget)]
    
#create a list of all unique destination city
    destination_city = list(filtered_df['city2'].unique())
    
#vcectorize city2 names with CountVectorizer
    cv = CountVectorizer().fit_transform(destination_city)
    vectors = cv.toarray()
    
#calculate cosine similarity between the starting city and all other cities
    city_index = destination_city.index(starting_city)
    cosine_similarities = cosine_similarity([vectors[city_index]], vectors).flatten()
#create a DataFrame with cities and their similarity scores
    similarity_df = pd.DataFrame({
        'city': destination_city,
        'similarity': cosine_similarities})
    
#merge with the filtered DataFrame to keep only relevant destinations
    result_df = pd.merge(filtered_df, similarity_df, left_on='city2', right_on='city')
    
    result_df.to_csv('../data/fare_result_df.csv', index=False)

#sort by similarity score and remove duplicates
    result_df = result_df.sort_values(by='similarity', ascending=False).drop_duplicates(subset='city2')
#get the recommended cities and their fares
    recommended_cities = result_df[['city2', 'fare']].values
    
#randomly selects 5 cities from the recommended cities
    if len(recommended_cities) > num_recommendations:
        recommended_cities = recommended_cities[np.random.choice(len(recommended_cities), num_recommendations, replace=False)]
    
    return recommended_cities

Two tests completed below to confirm code above is working, later to be copied and pasted into streamlit. 

In [18]:
#input variables
starting_city = 'Chicago'
min_budget = 200
max_budget = 350

recommended_cities = recommend_destinations(starting_city, min_budget, max_budget)

In [19]:
print(f'Recommended cities and their fares from {starting_city}:')
for city, fare in recommended_cities:
    print(f"{city}: ${fare}")

Recommended cities and their fares from Chicago:
Los Angeles: $205.45
New Orleans: $335.52
San Diego: $218.41
Nashville: $212.87
Las Vegas: $307.14


In [20]:
#input variables, checking another city. 
starting_city = 'Boston'
min_budget = 100
max_budget = 350

recommended_cities = recommend_destinations(starting_city, min_budget, max_budget)

In [21]:
print(f'Recommended cities and their fares from {starting_city}:')
for city, fare in recommended_cities:
    print(f"{city}: ${fare}")

Recommended cities and their fares from Boston:
Boston: $211.35
New Orleans: $124.38
Denver: $134.1
Los Angeles: $205.45
San Francisco: $194.36


### Recommender System on Airbnb Dataset based on Input City and Min/Max Budget. 
#### This to work with the recommender system on flight fare.

In [23]:
bnb = pd.read_csv('../data/air_bnb_data_clean.csv')

In [24]:
bnb.head()

Unnamed: 0,room_id,neighbourhood,room_type,room_price,minimum_nights,updated_date,city,state
0,575758,Five Points,Entire home/apt,200,180,2020-06-28,Denver,CO
1,1041934,Berkeley,Entire home/apt,215,30,2020-06-28,Denver,CO
2,1311993,CBD,Entire home/apt,90,30,2020-06-28,Denver,CO
3,1557739,Cole,Private room,60,30,2020-06-28,Denver,CO
4,3338717,Rosedale,Entire home/apt,90,2,2020-06-28,Denver,CO


In [25]:
bnb.shape

(193740, 8)

In [26]:
def simple_recommend_rooms(city, min_budget, max_budget, bnb):
#filter the dataset based on the user input
    filtered_bnb = bnb[(bnb['city'].str.lower() == city.lower()) &
                         (bnb['room_price'] >= min_budget) &
                         (bnb['room_price'] <= max_budget)].reset_index(drop=True)
    
#sort by room price to get the most relevant results within the budget
    filtered_bnb = filtered_bnb.sort_values(by='room_price')
    
#get top 5 rooms
    top_5_rooms = filtered_bnb.head(5)
    
#select the relevant columns to display
    result_bnb = top_5_rooms[['room_id', 'neighbourhood', 'room_price', 'city']]
    
    return result_bnb

In [27]:
#testing
city_input = "Denver"
min_budget_input = 50
max_budget_input = 100

In [28]:
recommended_bnb = simple_recommend_rooms(city_input, min_budget_input, max_budget_input, bnb)

recommended_bnb

Unnamed: 0,room_id,neighbourhood,room_price,city
778,39181811,Virginia Village,50,Denver
1532,32354216,Five Points,50,Denver
1536,33704620,Gateway - Green Valley Ranch,50,Denver
1541,37360309,Skyland,50,Denver
793,5379218,Hampden South,50,Denver
