# Airfare Countvectorizer and Cosine Similarities Performed.

In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
df = pd.read_csv('../data/flight_data_clean.csv')

In [6]:
df.head()

Unnamed: 0,year,city1,city2,fare,state1,state2
0,2021,Albuquerque,Boston,324.97,NM,MA (Metropolitan Area)
1,2021,Albuquerque,Boston,315.9,NM,MA (Metropolitan Area)
2,2021,Albuquerque,Boston,329.22,NM,MA (Metropolitan Area)
3,2021,Albuquerque,Washington,255.89,NM,DC (Metropolitan Area)
4,2021,Albuquerque,Washington,291.16,NM,DC (Metropolitan Area)


In [7]:
df.shape

(16099, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16099 entries, 0 to 16098
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    16099 non-null  int64  
 1   city1   16099 non-null  object 
 2   city2   16099 non-null  object 
 3   fare    16099 non-null  float64
 4   state1  16099 non-null  object 
 5   state2  16099 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 754.8+ KB


# Airfare Recommender System.

Followed link provided from Hank to CountVectorize and get Cosine Similarities for a recommender system.  (365datascience.com).  This code provides the similarities between city1 (departure city) and city2 (arrival city) which can be later used with budget input.

In [10]:
#removing duplicates
df.duplicated(subset='city1').sum()

16003

In [11]:
df = df.drop_duplicates(subset='city1')

In [12]:
#checking that duplicates were dropped
df.duplicated(subset='city1').sum()

0

In [13]:
#dropping columns not needed
df.drop(['year', 'state1', 'state2'], axis=1, inplace=True)

In [14]:
def clean_text(city1):
    result = str(city1).lower()
    return(result.replace(' ', ''))

In [15]:
df['city1'] = df['city1'].apply(clean_text)

In [16]:
df.head()

Unnamed: 0,city1,city2,fare
0,albuquerque,Boston,324.97
22,nantucket,New York City,165.27
26,coloradosprings,Washington,251.59
34,dallas/fortworth,Denver,134.1
81,pittsburgh,San Francisco,297.78


In [17]:
df['city2'] = df['city2'].str.lower()

In [18]:
df2 = df.drop(['fare'], axis=1)
df2['data'] = df2[df2.columns[0:]].apply( 
    lambda x: ' '.join(x.dropna().astype(str)), axis=1)

print(df2['data'].head())

0             albuquerque boston
22       nantucket new york city
26    coloradosprings washington
34       dallas/fortworth denver
81      pittsburgh san francisco
Name: data, dtype: object


In [19]:
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(df2['data'])

In [20]:
similarities = cosine_similarity(vectorized)

In [21]:
print(similarities)

[[1.   0.   0.   ... 0.   0.   0.  ]
 [0.   1.   0.   ... 0.   0.75 0.  ]
 [0.   0.   1.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 1.   0.   0.  ]
 [0.   0.75 0.   ... 0.   1.   0.  ]
 [0.   0.   0.   ... 0.   0.   1.  ]]


In [22]:
df = pd.DataFrame(similarities, columns=df['city1'], index=df['city1']).reset_index()

In [23]:
df.tail()

city1,city1.1,albuquerque,nantucket,coloradosprings,dallas/fortworth,pittsburgh,huntsville,albany,denver,atlanta,...,dayton,everett,tulsa,spokane,hiltonhead,bellingham,bangor,richmond,charlottesville,eureka/arcata
91,bellingham,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.666667,0.0,0.0
92,bangor,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.75,0.0,1.0,0.0,0.75,0.0
93,richmond,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,1.0,0.0,0.0
94,charlottesville,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.75,0.0,0.75,0.0,1.0,0.0
95,eureka/arcata,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
df.dtypes

city1
city1                object
albuquerque         float64
nantucket           float64
coloradosprings     float64
dallas/fortworth    float64
                     ...   
bellingham          float64
bangor              float64
richmond            float64
charlottesville     float64
eureka/arcata       float64
Length: 97, dtype: object

In [25]:
input_city = 'richmond'
recommendations = pd.DataFrame(df.nlargest(5,input_city)['city1'])

In [26]:
print(recommendations)

           city1
93      richmond
4     pittsburgh
37        eugene
71  philadelphia
72   palmsprings


In [27]:
df['city1'].unique()

array(['albuquerque', 'nantucket', 'coloradosprings', 'dallas/fortworth',
       'pittsburgh', 'huntsville', 'albany', 'denver', 'atlanta',
       'austin', 'asheville', 'tucson', 'phoenix', 'hartford', 'seattle',
       'birmingham', 'elpaso', 'cleveland', 'nashville', 'boise',
       'boston', 'burlington', 'buffalo', 'bozeman', 'chicago',
       'charleston', 'charlotte', 'columbus', 'st.louis', 'myrtlebeach',
       'jacksonville', 'detroit', 'desmoines', 'houston', 'orlando',
       'panamacity', 'valparaiso', 'eugene', 'keywest', 'kalispell',
       'minneapolis/st.paul', 'newyorkcity', 'fortmyers',
       'greenville/spartanburg', 'grandrapids', 'greensboro/highpoint',
       'lasvegas', 'indianapolis', 'jackson', 'jackson/vicksburg',
       'sanfrancisco', 'miami', 'losangeles', 'littlerock', 'louisville',
       'cincinnati', 'sacramento', 'tampa', 'kansascity', 'sanantonio',
       'memphis', 'omaha', 'milwaukee', 'madison', 'neworleans',
       "martha'svineyard", 'sandiego'