In [None]:
import json

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd

# Let pandas render up to 150 rows and 150 columns before truncating output.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, 150)

In [None]:
# Load the three CSV inputs used by the rest of the notebook:
# the raw listings, the cleaned listings and the reviews.
listing, listing_clean, reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('listings.csv', 'listing_clean.csv', 'reviews.csv')
)


In [None]:
# Draw missingno's matrix plot for the cleaned listings to inspect where values are missing.
msno.matrix(listing_clean)

# QUESTION N°1 ->

In [None]:
# QUESTION N°1
# Total number of reviews and number of listing ids per neighbourhood.

quartiers = listing_clean.groupby(by="neighbourhood_cleansed")
nb_review = quartiers["number_of_reviews"].sum()
# Counts the non-null `id` values in each neighbourhood.
# NOTE(review): `id` is presumably the listing id, so this is a number of
# listings rather than a number of distinct hosts - confirm.
nb_host = quartiers['id'].count()
question1 = pd.DataFrame({'nb_host': nb_host, 'nb_review': nb_review})

plt.figure(figsize=(10, 8))

# Each series is labelled where it is drawn. The previous version passed the
# DataFrame to plt.legend(), which assigned the column names in column order
# ('nb_host', 'nb_review') to bars drawn in the opposite order, so the legend
# entries were swapped.
plt.bar(question1.index, question1.nb_review, label="nb_review")
plt.bar(question1.index, question1.nb_host, label="nb_host")

plt.title("reviews by neighbourhood_cleansed")
plt.xlabel("Quartier")
plt.xticks(rotation=90)
plt.legend(loc=2)  # loc=2 is "upper left"
plt.show()

# QUESTION N°2 ->

In [None]:
# QUESTION N°2
# Average host response rate and acceptance rate. Both columns hold strings
# ending in '%', so the sign is stripped before converting to float.

response_rates = listing_clean["host_response_rate"].str.rstrip('%').astype(float)
mean_response = response_rates.mean()

acceptance_rates = listing_clean["host_acceptance_rate"].str.rstrip('%').astype(float)
mean_accept = acceptance_rates.mean()

# One-row summary table: a column per rate, a single row labelled "% par host".
question2 = pd.DataFrame(
    [[mean_response, mean_accept]],
    columns=['mean_response', 'mean_accept'],
    index=["% par host"],
)
question2


In [None]:
plt.figure(figsize=(6,6))

# question2 is indexed by the label "% par host", so `[0]` relied on the
# deprecated positional fallback of Series.__getitem__; .iloc[0] is explicit.
mean_response_pct = question2["mean_response"].iloc[0]

# Pie of the mean response rate against its complement to 100 %.
plt.pie([mean_response_pct, 100 - mean_response_pct], autopct='%1.1f%%', labels=["mean response", "No response"])
plt.title("Mean host response rate")

plt.show()

In [None]:
plt.figure(figsize=(6,6))

# question2 is indexed by the label "% par host", so `[0]` relied on the
# deprecated positional fallback of Series.__getitem__; .iloc[0] is explicit.
mean_accept_pct = question2["mean_accept"].iloc[0]

# Pie of the mean acceptance rate against its complement to 100 %.
# The complement slice was labelled "No response" (copied from the response
# chart); on this chart it is the share that was not accepted.
plt.pie([mean_accept_pct, 100 - mean_accept_pct], autopct='%1.1f%%', labels=["mean accept", "Not accepted"])
plt.title("Mean host acceptance rate")
plt.show()

# QUESTION N°3 ->

In [None]:
# QUESTION N°3
# Share (in %) of rows of listing_clean whose host_verifications text mentions
# each verification method.

verifications = listing_clean["host_verifications"]
# Numerator and denominator now both come from listing_clean; the previous
# version divided by the non-null count of the raw `listing` table.
nb_verifications = verifications.count()

has_phone = verifications.str.contains("phone", regex=False, na=False)
has_work_email = verifications.str.contains("work_email", regex=False, na=False)
# `\bemail\b` matches "email" as a standalone token only: "_" is a word
# character, so the "email" inside "work_email" is not matched. The previous
# test (`"work_email" != x`) compared the whole cell text and therefore never
# excluded work e-mails.
has_email = verifications.str.contains(r"\bemail\b", regex=True, na=False)

phone_verification = has_phone.sum() / nb_verifications * 100
work_email_verification = has_work_email.sum() / nb_verifications * 100
email_verification = has_email.sum() / nb_verifications * 100

# The work_email column previously received the plain e-mail percentage.
question3 = pd.DataFrame(
    {
        'phone_verification': [phone_verification],
        'work_email_verification': [work_email_verification],
        'email_verification': [email_verification],
    },
    index=["% par host"],
)
question3

In [None]:

# Bar chart of the single row of question3: one bar per verification type.
fig, ax = plt.subplots(figsize=(8, 6))

ax.bar(question3.columns, question3.iloc[0].values)
ax.set_ylabel("host %")
ax.set_xlabel("Type of Verification")
plt.show()

# QUESTION N°4 ->

In [None]:
# QUESTION N°4
# Mean and standard deviation of the number of amenities per room type.

def _parse_amenities(raw):
    """Turn one raw `amenities` cell into a list of amenity names.

    The cell is first parsed as a JSON array, which keeps amenity names that
    contain commas intact and decodes escapes such as \\u2013. If the text is
    not a JSON array, the previous strategy is used as a fallback: strip the
    first and last character, remove double quotes, and split on commas.
    Empty items are discarded, so an empty list yields zero amenities instead
    of one empty string.

    :param raw: cell value (text, missing value, or an already parsed list)
    :return: list of amenity names
    """
    if isinstance(raw, list):
        # Already parsed (the cell was re-run): keep the list as is.
        return raw
    if pd.isna(raw):
        return []
    text = str(raw)
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None
    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    inner = text[1:-1].replace('"', '').replace("\\u2013", "-")
    return [part.strip() for part in inner.split(",") if part.strip()]


amenities = listing_clean['amenities'].apply(_parse_amenities)
listing_clean['amenities'] = amenities

listing_clean["nb_amenities"] = listing_clean["amenities"].apply(len)

grouped_room_type = listing_clean[["room_type","nb_amenities"]].groupby(['room_type']).agg(['mean', 'std'])
grouped_room_type




In [None]:
# Mean number of amenities per room type, with the standard deviation drawn
# as error bars.
amenity_stats = grouped_room_type['nb_amenities']

fig, ax = plt.subplots(figsize=(9, 6))
ax.bar(amenity_stats.index, amenity_stats["mean"], yerr=amenity_stats["std"], capsize=4)

ax.set_ylabel("mean of Amenities")
ax.set_xlabel("Type of room")
plt.show()

# QUESTION N°5 ->

In [None]:
# QUESTION N°5
# Convert the price text to float by removing the thousands separator and the
# dollar sign, then summarise the price per room type.
price_text = listing_clean["price"].astype(str)
for symbol in (",", "$"):
    price_text = price_text.str.replace(symbol, "", regex=False)
listing_clean["price"] = price_text.astype(float)

# Keep only min, quartiles and max from describe().
prix = (
    listing_clean.groupby("room_type")["price"]
    .describe()
    .drop(columns=['count', 'std', 'mean'])
)
prix


In [None]:
plt.figure(figsize=(9,6))

# Box plot of the price distribution for each room type.
# The previous version passed the summary table `prix` to plt.boxplot, which
# draws one box per *column* (min, 25%, 50%, 75%, max) across room types
# instead of one box per room type built from the actual prices.
price_groups = listing_clean.groupby("room_type")["price"]
room_types = [room_type for room_type, _ in price_groups]
price_samples = [prices.dropna().to_numpy() for _, prices in price_groups]

plt.boxplot(price_samples)
# Boxes are drawn at positions 1..n; label them with the room types.
plt.xticks(range(1, len(room_types) + 1), room_types)
plt.xlabel("Type of room")
plt.ylabel("price")
plt.title("Price distribution by room type")
plt.show()

# QUESTION N°6 ->

In [None]:
# QUESTION N°6
# Split each `bathrooms_text` value into a numeric count ("Number") and a
# normalised, plural description ("Type").

nb_bathrooms = listing["bathrooms"]

# Labels that carry no leading number; they are prefixed with "1 ".
half_bath_labels = ("Shared half-bath", "Half-bath", "Private half-bath")


def _normalise_bath_label(label):
    """Return the label with a leading count and a trailing "s".

    :param label: one non-missing `bathrooms_text` value
    :return: text such as "1 Half-baths" or "2 shared baths"
    """
    label = str(label)
    if label in half_bath_labels:
        label = "1 {}".format(label)
    if not label.lower().endswith("s"):
        label += "s"
    return label


# na_action="ignore" leaves missing values untouched, so they become NaN in
# both columns after the split (the previous version reached the same result
# by turning them into the text "nans" and replacing that text with 0).
bathrooms = listing["bathrooms_text"].map(_normalise_bath_label, na_action="ignore")
# `n` must be passed by keyword: pandas 2.0 made every argument of str.split
# except `pat` keyword-only, so `split(' ', 1, ...)` raises a TypeError there.
bathrooms = bathrooms.str.split(' ', n=1, expand=True)
bathrooms = bathrooms.rename(columns={0: "Number", 1: "Type"})
bathrooms["Number"] = bathrooms["Number"].astype(float)

def convertiseur(df):
    """
    Scale the number in the first cell of a row according to the bath type
    in the second cell.

    The type is compared case-insensitively:
    - "shared baths", "bath shareds", "half-baths"  -> number * 0.5
    - "private baths"                               -> number * 2
    - "half-shareds", "shared half-baths"           -> number * 0.25
    - anything else                                 -> number * 1

    The row is read by position and is not modified.

    : param df : one row (a pandas Series, or any sequence indexable by 0 and 1)
    : return : the scaled number
    """
    # Rows coming from DataFrame.apply(axis=1) are labelled ("Number", "Type"),
    # so integer keys must go through .iloc; plain sequences are indexed directly.
    cells = df.iloc if hasattr(df, "iloc") else df
    number = cells[0]
    bath_type = str(cells[1]).lower()

    if bath_type in ("shared baths", "bath shareds", "half-baths"):
        return number * 0.5
    if bath_type == "private baths":
        return number * 2
    if bath_type in ("half-shareds", "shared half-baths"):
        return number * 0.25
    return number * 1

# Replace the raw count with the value weighted by convertiseur, row by row.
bathrooms["Number"] = bathrooms.apply(convertiseur, axis=1)

# Count how many rows fall on each weighted value. Rows without a "Type"
# are left out, and groupby itself skips rows whose "Number" is missing.
bathrooms = (
    bathrooms.dropna(subset=["Type"])
    .groupby("Number")
    .size()
    .reset_index(name="count")
)
bathrooms


In [None]:
# Bar chart: one bar per weighted bathroom value, height = number of rows.
bathrooms.plot(kind="bar", x="Number", y="count")

# QUESTION N°7  ->

In [None]:

# QUESTION N°7
# Correlation between the length of a listing description and its number of
# reviews.

# Missing descriptions count as length 0. The previous `len(str(x))` turned a
# missing value into the text "nan" and counted it as 3 characters.
listing_clean['len_description'] = listing_clean['description'].fillna("").astype(str).str.len()
corr = listing_clean['len_description'].corr(listing_clean['number_of_reviews'])
corr



In [None]:
# Scatter plot of description length (x axis) against number of reviews (y axis).
x, y = listing_clean['len_description'], listing_clean['number_of_reviews']

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set_xlabel("Length of Description")
ax.set_ylabel("Number of Reviews")
ax.set_title("Correlation between Description Length and Number of Reviews")
plt.show()


# QUESTION N°8 ->

In [None]:
# Attach each review to its listing, then flag reviews whose reviewer name
# equals the host name of the reviewed listing.
# NOTE(review): matching on names alone will also flag unrelated people who
# share a name; confirm whether this heuristic is precise enough.
reviews_with_host = listing.merge(reviews, left_on='id', right_on='listing_id')
same_name_mask = reviews_with_host["host_name"] == reviews_with_host["reviewer_name"]

# Distinct host / reviewer pairs that share a name.
fake_reviews = reviews_with_host.loc[
    same_name_mask, ["host_name", "host_id", "reviewer_name", "reviewer_id"]
].drop_duplicates()

# Percentage of reviews that are flagged. The numerator counts flagged
# *reviews*; the previous version used the number of de-duplicated pairs,
# which is not comparable with len(reviews). An empty reviews table gives 0.
fake_ADS = (same_name_mask.sum() / len(reviews)) * 100 if len(reviews) else 0.0


In [None]:
# Display the distinct host / reviewer pairs whose names are identical.
fake_reviews

In [None]:
# Distinct (host_name, reviewer_name) combinations among the rows of
# fake_reviews where both names are equal.
names_match = fake_reviews["host_name"].eq(fake_reviews["reviewer_name"])
Same_Name = fake_reviews.loc[names_match, ["host_name", "reviewer_name"]].drop_duplicates()

In [None]:
# Display the distinct names shared by a host and a reviewer.
Same_Name