In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import geopy
import io

# Run this cell and upload ramen-ratings.csv from your computer when prompted.
from google.colab import files

uploaded = files.upload()

# Report the name and size (in bytes) of every uploaded file.
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(payload)))

In [None]:
# `uploaded` is keyed by file name. Look the data up under its expected name
# first; if it is missing (NOTE(review): Colab appears to save a repeated
# upload under a name like "ramen-ratings (1).csv" - confirm), fall back to
# the only uploaded file rather than failing with an unexplained KeyError.
ramen_filename = 'ramen-ratings.csv'
if ramen_filename not in uploaded:
    if len(uploaded) != 1:
        raise KeyError(
            'Expected an upload named "{}", but got: {}'.format(
                ramen_filename, sorted(uploaded)))
    ramen_filename = next(iter(uploaded))

# The upload is raw bytes: decode to text and parse it as CSV.
ramen = pd.read_csv(io.StringIO(uploaded[ramen_filename].decode('utf-8')))
#ramen = pd.read_csv('./ramen-ratings.csv')
ramen


In [None]:
# Column-subsetting refresher: a list of column names inside the brackets
# returns a DataFrame holding just those columns.
variety_and_country = ramen[['Variety', 'Country']]
stars_only = ramen[['Stars']]

print('Ramen:', variety_and_country)
print(stars_only)

In [None]:
# Print describe()'s summary of the data and compare it with the DataFrame
# shown above.
# Which columns did you expect to be numeric? Which ones are actually numeric?
summary_before = ramen.describe()
print('Ramen describe:', summary_before)

# The data type of each column in the DataFrame.
dtypes_before = ramen.dtypes
print('Ramen Type:', dtypes_before)

In [None]:
# We expect the 'Stars' column to be numeric. Try converting it to a numeric
# column with the line below: it will not work.
#pd.to_numeric(ramen['Stars'])

In [None]:
# pandas must be told what to do with values that cannot be converted to a
# number, for example "Unrated". errors='coerce' is the option that converts
# such non-numeric items to NaN (a missing value). Documentation:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html
pd.to_numeric(ramen.loc[:, 'Stars'], errors='coerce')


In [None]:
# Once the conversion works and yields a Series, store it in the data frame
# as a new column called "Rating".
# Hint: See "Adding a column" in the tutorial
numeric_stars = pd.to_numeric(ramen['Stars'], errors='coerce')
ramen['Rating'] = numeric_stars
print('Ramen:', ramen)

In [None]:
# Summarise the data again to see what the updated numeric variables look like.
# Question: from the describe output, how many values of Stars were missing?
summary_after = ramen.describe()
print('Ramen describe:', summary_after)

# The data type of each column in the DataFrame.
dtypes_after = ramen.dtypes
print('Ramen Type:', dtypes_after)



In [None]:
# Use seaborn to plot a histogram of the ramen ratings with 10 bins.
# The figure and axes are created explicitly and handed to seaborn, so the
# labels below are applied to exactly the axes that were drawn on.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(x="Rating", bins=10, data=ramen, ax=ax)
ax.set_ylabel('Frequency')
ax.set_xlabel('Rating')
ax.set_title("Histogram of the rating of the ramen")
plt.show()



In [None]:
# How many surveys (rows) there are per country.
rows_per_country = ramen.groupby('Country').size()
print(rows_per_country)

In [None]:
# count() reports, per country, the number of non-missing values in each column.
non_missing_per_country = ramen.groupby('Country').count()
print(non_missing_per_country)

In [None]:

# How many (non-missing) ratings there are for each Country.
ratings_by_country = ramen.groupby('Country')['Rating']
print(ratings_by_country.count())

In [None]:
# Both "United States" and "USA" are listed; fold them into a single label.
# Series.replace with a mapping rewrites only cells whose entire value is
# "USA", whereas .str.replace would also rewrite "USA" wherever it occurs
# inside a longer country name.
ramen['Country'] = ramen['Country'].replace({'USA': 'United States'})
print(ramen['Country'])

In [None]:
# With the data cleaned, recompute the number of rows per country and check
# what type of object groupby(...).size() returns.
cleaned_rows_per_country = ramen.groupby('Country').size()
print(cleaned_rows_per_country)
print(type(cleaned_rows_per_country))





In [None]:

# Save the per-country row counts in a variable called count_by_countries and
# print it to check it.
# groupby(...).size() by itself gives a Series, not a DataFrame; to_frame()
# converts it into a one-column DataFrame that is still indexed by Country.
count_by_countries = ramen.groupby('Country').size().to_frame()
print(count_by_countries)
print(type(count_by_countries))
count_by_countries

In [None]:
"""
Notice that the output is a bit strange: the country is in bold, and depending on the strategy you chose above,
the count may be labeled "Country". Earlier in this lab, we learned that the index of a row is not necessarily the row number;
it is really a row identifier. We converted one of the columns to a row identifier, or index, using set_index().

Here we're in the opposite situation: the index contains the information that we actually want in a column.
We will convert the index to a column using reset_index().
"""
# reset_index() returns a new DataFrame with the index moved into a column;
# count_by_countries itself is left unchanged, as the last line shows.
with_country_column = count_by_countries.reset_index()
print(with_country_column)
count_by_countries


In [None]:
# Calling reset_index() without arguments only showed what a copy of the
# DataFrame would look like; it did not save the change to count_by_countries.
# To save the change, pass inplace=True between the parentheses.
# Keep this in mind; this option for inplace occurs in many other pandas
# methods as well.
#
# The guard keeps this cell safe to re-run: once Country is already a column,
# a second in-place reset would add a stray "index" column, and assigning
# exactly two column names afterwards would fail with a length mismatch.
if 'Country' not in count_by_countries.columns:
    count_by_countries.reset_index(inplace=True)
count_by_countries



In [None]:
# The column names depend on how the number of ratings per country was
# computed, so they may be off at this point. Assign fixed names so that
# everyone works from the same starting point, then check the result.
standard_columns = ['Country','count']
count_by_countries.columns = standard_columns
print(count_by_countries.columns)
print(count_by_countries)

In [None]:
"""

In this section, we'll look at:
- Merging (i.e. joining) two data frames
- Another split-apply-combine example
- Ordering a bar plot by values

Extract data for the countries that make the most ramen and create a barplot

Suppose we want to take a closer look at only the countries that produce a lot of ramen. Let's keep only the countries that have > 100 ramen ratings.

There are probably many ways to do this, and you are welcome to think up your  own, if you wish. I will walk you through one.
"""

In [None]:
# Row selection with a boolean mask: keep every line whose count is greater
# than 100 and save the result as ramen_countries.
has_over_100 = count_by_countries['count'] > 100
ramen_countries = count_by_countries[has_over_100]
print(ramen_countries)

In [None]:
# Join the original data frame (left) with ramen_countries (right) to retain
# only the countries with more than 100 ramen varieties.
#   how: of inner / outer / left / right, 'inner' keeps only the rows whose
#        Country appears in both frames, i.e. the countries in ramen_countries.
#   on:  'Country' is the column the two frames have in common, so it tells
#        pandas how to match the rows.
ramen_top_countries = pd.merge(ramen, ramen_countries, how='inner', on='Country')
print(ramen_top_countries)

In [None]:
# Check your work: sum the count column of ramen_countries and compare it with
# the number of rows in the merged ramen_top_countries frame. The two numbers
# should *almost* match (differing perhaps by a small number of missing
# ratings). The size of the full ramen frame is printed for reference.
total_rows = len(ramen.index)
expected_rows = ramen_countries['count'].sum()
merged_rows = len(ramen_top_countries.index)

print('Number of Rows in ramen : ' , total_rows)
print('Sum of count in ramen_countries : ' , expected_rows)
print('Number of Rows in ramen_top_countries : ' , merged_rows)

In [None]:
# Bar plot of rating by country from the ramen_top_countries data.
# Country goes on the x axis and Rating on the y axis; barplot aggregates the
# rows of each country into one bar (the mean, by default).
# ci=None removes the confidence interval (in frequentist statistics, a range
# of estimates for an unknown parameter) that would otherwise be drawn on
# every bar.
fig, ax = plt.subplots(figsize=(15, 8))  # explicit figure so the plot size can be set
sns.barplot(x='Country', y='Rating', ci=None, data=ramen_top_countries, ax=ax)
ax.set_ylabel("Mean Rating")
ax.set_xlabel("Country")
ax.set_title("Bar plot of rating by country")
plt.show()


In [None]:
# We would prefer to have the bars ordered. For that we need a DataFrame with
# one row per country holding its rating and (for later use) its count.
# Spoiler alert: if you want to figure this out yourself, stop reading here.

# Step 1 - one group per country. as_index=False keeps Country as an ordinary
# column, which avoids all the index resetting afterwards.
grouped_by_country = ramen_top_countries.groupby("Country", as_index=False)

# Step 2 - select the Rating and count columns. Two sets of brackets are
# needed to select multiple columns: the inner pair is the list of columns.
rating_and_count = grouped_by_country[['Rating', 'count']]

# Step 3 - take the mean. This gives the mean rating per country; the count is
# already repeated in every row of a country, so its mean is that same number.
pd.DataFrame(rating_and_count.mean())


In [None]:
# Sort the per-country means by rating, highest first (ascending=False), and
# save the result as ramen_rating_by_country.
country_means = pd.DataFrame(
    ramen_top_countries.groupby("Country", as_index=False)[['Rating', 'count']].mean()
)
ramen_rating_by_country = country_means.sort_values(by='Rating', ascending=False)
ramen_rating_by_country

In [None]:
# Create a barplot of country vs. rating using ramen_rating_by_country.
# ci=None turns the confidence-interval bars off.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x='Country', y='Rating', ci=None, data=ramen_rating_by_country, ax=ax)
ax.set_ylabel("Mean Rating")
ax.set_xlabel("Country")
ax.set_title("Bar plot of Mean Rating by country")
plt.show()

In [None]:
# Scatter rating against count for ramen_rating_by_country - essentially
# quality vs. quantity. regplot adds a best-fit line automatically; ci=None
# leaves out the confidence band around it.
plt.figure(figsize=(15, 8))
ax = sns.regplot(data=ramen_rating_by_country, x='Rating', y='count', ci=None)
ax.set(
    xlabel="Rating",
    ylabel="Count",
    title="Reg plot of Mean Rating by country",
)
plt.show()

# Based on the plot above, is there a quality vs. quantity tradeoff?
# In other words, does the mean rating seem to be related to the number of types
# of ramen that the country produces?
# (Consider the correlation of these variables, or the algebra of the best-fit line.)