In [1]:
# Kaggle's Python 3 image (https://github.com/kaggle/docker-python) ships with the
# analytics libraries imported below.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)  # set up Plotly's offline notebook mode before any figure is shown

# Input data files are available in the "../input/" directory.
# List that directory so the available file names show up in the cell output.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

import pandas as pd  # NOTE(review): repeats the pandas import above; harmless but redundant
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't render on your side
%matplotlib inline

# Apply matplotlib's 'bmh' style sheet to any matplotlib-based figures
plt.style.use('bmh')


['ramen-ratings.csv']


In [2]:
# Load the raw ramen ratings into `df`
df = pd.read_csv(filepath_or_buffer="../input/ramen-ratings.csv")

# Report the (rows, columns) shape, then preview the first five records
print(df.shape)
df.head(n=5)

(2580, 7)


Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,


### 1. Preprocessing the data

In [3]:
# Column dtypes and non-null counts (info() prints its report directly)
df.info()
# Summary statistics for every column, numeric or not (include="all")
df.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 7 columns):
Review #    2580 non-null int64
Brand       2580 non-null object
Variety     2580 non-null object
Style       2578 non-null object
Country     2580 non-null object
Stars       2580 non-null object
Top Ten     41 non-null object
dtypes: int64(1), object(6)
memory usage: 141.2+ KB


Unnamed: 0,Review #,Brand,Variety,Style,Country,Stars,Top Ten
count,2580.0,2580,2580,2578,2580,2580.0,41
unique,,355,2413,7,38,51.0,38
top,,Nissin,Chicken,Pack,Japan,4.0,\n
freq,,381,7,1531,352,384.0,4
mean,1290.5,,,,,,
std,744.926171,,,,,,
min,1.0,,,,,,
25%,645.75,,,,,,
50%,1290.5,,,,,,
75%,1935.25,,,,,,


#### 1.1 Data types and completeness
* Some column names have spaces between words
* __Style__ is missing 2 values, as indicated by its count of 2578 while the other columns have 2580  
* __Stars__ is currently stored as object, whereas storing it as float would make it far more useful for analysis 
* __Top Ten__ seems to indicate whether or not the ramen has made it into a yearly "Top Ten" list
* __Top Ten__ has most frequent value of '\n' indicating there are rows with meaningless value


In [4]:
# Candidate columns whose distinct values are worth eyeballing
columns = ['Style','Country','Stars','Top Ten']

# Print the distinct values of every candidate column stored as `object`
for col in columns:
    if df[col].dtypes != "object":
        continue
    print(col, ":", df[col].unique().tolist(), "\n")

Style : ['Cup', 'Pack', 'Tray', 'Bowl', 'Box', 'Can', 'Bar', nan] 

Country : ['Japan', 'Taiwan', 'USA', 'India', 'South Korea', 'Singapore', 'Thailand', 'Hong Kong', 'Vietnam', 'Ghana', 'Malaysia', 'Indonesia', 'China', 'Nigeria', 'Germany', 'Hungary', 'Mexico', 'Fiji', 'Australia', 'Pakistan', 'Bangladesh', 'Canada', 'Nepal', 'Brazil', 'UK', 'Myanmar', 'Netherlands', 'United States', 'Cambodia', 'Finland', 'Sarawak', 'Philippines', 'Sweden', 'Colombia', 'Estonia', 'Holland', 'Poland', 'Dubai'] 

Stars : ['3.75', '1', '2.25', '2.75', '4.75', '4', '0.25', '2.5', '5', '4.25', '4.5', '3.5', 'Unrated', '1.5', '3.25', '2', '0', '3', '0.5', '4.00', '5.0', '3.50', '3.8', '4.3', '2.3', '5.00', '3.3', '4.0', '3.00', '1.75', '3.0', '4.50', '0.75', '1.25', '1.1', '2.1', '0.9', '3.1', '4.125', '3.125', '2.125', '2.9', '0.1', '2.8', '3.7', '3.4', '3.6', '2.85', '3.2', '3.65', '1.8'] 

Top Ten : [nan, '2016 #10', '2016 #1', '2016 #8', '2016 #5', '2016 #9', '2016 #7', '2015 #10', '2015 #7', '2015 #4

#### 1.2 Investigate categories of each observable 'object' columns
* __Style__ has a nan value, which is meaningless. We know from the earlier observation that there are only 2 such rows. We can leave them as they are since they are not sizeable, but it's worth checking the other values of these 2 rows
* __Country__ column seems normal
* __Stars__ has an 'Unrated' value, which will cause a problem later when we convert the column to float. It should therefore be replaced with another value before converting. It's also interesting to know whether or not it has a sizeable presence
* __Top Ten__ carries 2 pieces of information: the year and the rank of the noodle. For easier exploration later, we can split this column into 2. We'll also convert the '\n' into blank

#### 1.3 Operations
So, putting things together, the procedures that will be executed during preprocessing are:
1. Add an underscore to column names that contain a space
2. __Stars:__ Replace 'Unrated' with '-1' and change its data type to float
3. __Top Ten:__ Replace '\n' with blank and break the column into 2 (year and rank)

In [5]:
# Keep the raw frame `df` untouched and do all preprocessing on a copy, `df1`.
# (If `df` is ever lost, reload it with pd.read_csv("../input/ramen-ratings.csv").)
df1=df.copy()

In [6]:
# Build the preprocessed frame `df1` from the raw frame `df`.
# Deriving it from `df` (instead of mutating `df1` in place) keeps this cell
# re-runnable: the previous version failed on a second run because `Top_Ten`
# had already been dropped from `df1`.

# Fix spaces in column names ("Review #" -> "Review_#", "Top Ten" -> "Top_Ten")
df1 = df.rename(columns=lambda name: name.replace(' ', '_'))

# Stars - replace 'Unrated' with the sentinel '-1', then convert object -> float.
# NOTE(review): the -1 rows take part in every later mean of Stars; filter on
# Stars >= 0 before aggregating if that is not intended.
df1['Stars'] = df1['Stars'].replace(to_replace='Unrated', value='-1').astype(float)

# Top Ten - '\n' is a meaningless placeholder, so treat it as missing
top_ten = df1['Top_Ten'].replace(to_replace='\n', value=np.nan)

# Top Ten - values look like '2016 #10': split once on '#' into year and rank.
# The year part keeps the space that preceded '#', so strip it ('2016 ' -> '2016').
top_ten_parts = top_ten.str.split('#', n=1, expand=True)
df1['Topten_Year'] = top_ten_parts[0].str.strip()

# Top Ten - rank as float (float rather than int because most rows are NaN)
df1['Topten_Rank'] = top_ten_parts[1].astype(float)

# The combined column is no longer needed
df1 = df1.drop('Top_Ten', axis=1)

In [7]:
# Verify the new dtypes and non-null counts; info() prints its report directly.
df1.info()

# Preview rows that actually carry a Top Ten rank. A notebook only renders the
# value of a cell's LAST expression, so this preview must come after info();
# in the opposite order it was computed and silently discarded.
df1[df1['Topten_Rank'].notnull()].head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 8 columns):
Review_#       2580 non-null int64
Brand          2580 non-null object
Variety        2580 non-null object
Style          2578 non-null object
Country        2580 non-null object
Stars          2580 non-null float64
Topten_Year    37 non-null object
Topten_Rank    37 non-null float64
dtypes: float64(2), int64(1), object(5)
memory usage: 161.4+ KB


In [8]:
# Preview the first ten rows of the preprocessed frame
df1.head(10)

Unnamed: 0,Review_#,Brand,Variety,Style,Country,Stars,Topten_Year,Topten_Rank
0,2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,,
1,2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,,
2,2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,,
3,2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,,
4,2576,Ching's Secret,Singapore Curry,Pack,India,3.75,,
5,2575,Samyang Foods,Kimchi song Song Ramen,Pack,South Korea,4.75,,
6,2574,Acecook,Spice Deli Tantan Men With Cilantro,Cup,Japan,4.0,,
7,2573,Ikeda Shoku,Nabeyaki Kitsune Udon,Tray,Japan,3.75,,
8,2572,Ripe'n'Dry,Hokkaido Soy Sauce Ramen,Pack,Japan,0.25,,
9,2571,KOKA,The Original Spicy Stir-Fried Noodles,Pack,Singapore,2.5,,


### 2. Exploratory
To explore the data, I am curious to know: <br>
`1.` What are the average variety and ratings for each brand? <br>
`2.` Which country has the highest competition? <br>
`3.` How strictly does each country judge ramen? <br>
`4.` What is the leading global brand? <br>

In [9]:
# Per-brand summary: number of distinct varieties and mean star rating.
# (Also reused by the next cell for the ratings histogram.)
df1_brand_histograms = df1.groupby(['Brand']).agg(
    {'Variety': pd.Series.nunique, 'Stars': 'mean'}
)

# Plot 1 of 2: how many brands offer a given number of varieties
variety_trace = go.Histogram(
    x=df1_brand_histograms["Variety"],
    name="Variety",
    marker=dict(color="rgba(20,189,204,0.2)", line=dict(color='#14BDCC', width=1)),
    hoverinfo="x+y+z+text",
    hoverlabel=dict(
        bgcolor="rgba(20,189,204,1)",
        bordercolor="rgba(20,189,204,0.5)",
        font=dict(family="Arial", size=10, color='rgba(255,255,255,1)'),
    ),
)
fig = go.Figure(data=variety_trace)

# Styling common to both axes; only the title text and `showgrid` differ
axis_title_font = dict(size=12, color='#1A1817')
axis_style = dict(
    ticks="outside", tickwidth=0.5, tickcolor='#F2E9E1', ticklen=10, hoverformat=",.1f",
    showline=True, linewidth=2, linecolor='#F2E9E1',
    gridwidth=1, gridcolor='#F2E9E1',
    zeroline=True, zerolinewidth=1, zerolinecolor='crimson',
)
fig.update_xaxes(title=dict(text="<b>VARIETY</b> (total variety)", font=axis_title_font),
                 showgrid=False, **axis_style)
fig.update_yaxes(title=dict(text="<b>BRAND</b> (total)", font=axis_title_font),
                 showgrid=True, **axis_style)

# Headline and subtitle sit above the plot area, anchored in paper coordinates
annotation_anchor = dict(x=-0.05, xref="paper", yref="paper", showarrow=False,
                         xanchor="left", yanchor="top", align="left")
headline = go.layout.Annotation(
    text="<b>Most brand has at most 4 variety</b>", y=1.18,
    font=dict(family=" Raleway,sans-serif", size=16, color='#1A1817'),
    **annotation_anchor)
subtitle = go.layout.Annotation(
    text="Total brands by total variety of ramen", y=1.12,
    font=dict(family='Raleway,sans-serif', size=14, color='#98928E'),
    **annotation_anchor)

fig.update_layout(
    font=dict(family="Raleway, sans-serif", size=12, color='#98928E'),
    plot_bgcolor="#fffaf7",
    paper_bgcolor="#fffaf7",
    showlegend=False,
    height=500,
    annotations=[headline, subtitle],
)
fig.show()

In [10]:
# Plot 2 of 2: cumulative number of brands by their average star rating
# (uses `df1_brand_histograms` prepared in the previous cell)
stars_trace = go.Histogram(
    x=df1_brand_histograms["Stars"],
    cumulative=dict(enabled=True),
    showlegend=False,
    marker=dict(color="rgba(20,189,204,0.2)", line=dict(color='#14BDCC', width=1)),
    hoverinfo="x+y+z+text",
    hoverlabel=dict(
        bgcolor="rgba(20,189,204,1)",
        bordercolor="rgba(20,189,204,0.5)",
        font=dict(family="Arial", size=10, color='rgba(255,255,255,1)'),
    ),
)
fig = go.Figure(data=stars_trace)

# Styling common to both axes; only the title text and `showgrid` differ
axis_title_font = dict(size=12, color='#1A1817')
axis_style = dict(
    ticks="outside", tickwidth=0.5, tickcolor='#F2E9E1', ticklen=10, hoverformat=",.1f",
    showline=True, linewidth=2, linecolor='#F2E9E1',
    gridwidth=1, gridcolor='#F2E9E1',
    zeroline=True, zerolinewidth=1, zerolinecolor='crimson',
)
fig.update_xaxes(title=dict(text="<b>STARS</b> (average ratings)", font=axis_title_font),
                 showgrid=False, **axis_style)
fig.update_yaxes(title=dict(text="<b>BRAND</b> (total cummulative)", font=axis_title_font),
                 showgrid=True, **axis_style)

# Headline and subtitle sit above the plot area, anchored in paper coordinates
annotation_anchor = dict(x=-0.05, xref="paper", yref="paper", showarrow=False,
                         xanchor="left", yanchor="top", align="left")
headline = go.layout.Annotation(
    text="<b>Most brand rated less than 4</b>", y=1.18,
    font=dict(family=" Raleway,sans-serif", size=16, color='#1A1817'),
    **annotation_anchor)
subtitle = go.layout.Annotation(
    text="Total brands (cummulative) by average ratings", y=1.12,
    font=dict(family='Raleway,sans-serif', size=14, color='#98928E'),
    **annotation_anchor)

fig.update_layout(
    font=dict(family="Raleway, sans-serif", size=12, color='#98928E'),
    plot_bgcolor="#fffaf7",
    paper_bgcolor="#fffaf7",
    showlegend=False,
    height=500,
    annotations=[headline, subtitle],
)
fig.show()

#### 2.1 What are the average variety and ratings for each brand?

Anchoring on brand, we can see that most brands have at most 4 varieties, with one outlier brand that has almost 400 varieties. Reviewers also seem rather selective in their ratings: most brands are rated below 4 on average.

In [11]:
# Per-country summary; the two-level column index produced by agg() is flattened
# to "<column>_<aggregation>" names, e.g. "Variety_nunique".
# Only Variety_nunique and Brand_nunique are plotted here. The unused `df3`,
# `colorscale1` and `colorscale1_line` definitions that used to live in this cell
# were dead copy-paste; the next cell defines them itself before using them.
df2 = df1.groupby('Country').agg({'Variety':pd.Series.nunique,
                                  'Brand':pd.Series.nunique,
                                  'Stars':'mean',
                                  'Review_#':['mean','sum'],
                                 })
df2.columns = df2.columns.get_level_values(0)+"_"+df2.columns.get_level_values(1)

# One marker per country: distinct varieties (x) against distinct brands (y)
country_trace = go.Scatter(
    x=df2["Variety_nunique"],
    y=df2["Brand_nunique"],
    name="Country",
    mode="markers",
    text=df2.index,  # country name shown on hover
    marker=dict(color="rgba(20,189,204,0.2)", size=15,
                line=dict(color='#14BDCC', width=1)),
    hoverinfo="x+y+z+text",
    hoverlabel=dict(
        bgcolor="rgba(20,189,204,1)",
        bordercolor="rgba(20,189,204,0.5)",
        font=dict(family="Arial", size=10, color='rgba(255,255,255,1)'),
    ),
)
fig = go.Figure(data=country_trace)

# Both axes share the same styling; only the title text differs
axis_title_font = dict(size=12, color='#1A1817')
axis_style = dict(
    ticks="outside", tickwidth=0.5, tickcolor='#F2E9E1', ticklen=10, hoverformat=",.1f",
    showline=True, linewidth=2, linecolor='#F2E9E1',
    showgrid=True, gridwidth=1, gridcolor='#F2E9E1',
    zeroline=True, zerolinewidth=1, zerolinecolor='crimson',
)
fig.update_xaxes(title=dict(text="<b>VARIETY</b> (total)", font=axis_title_font), **axis_style)
fig.update_yaxes(title=dict(text="<b>BRAND</b> (total)", font=axis_title_font), **axis_style)

# Headline and subtitle sit above the plot area, anchored in paper coordinates
annotation_anchor = dict(x=-0.05, xref="paper", yref="paper", showarrow=False,
                         xanchor="left", yanchor="top", align="left")
headline = go.layout.Annotation(
    text="<b>Brand and variety are proportional with 1:5 ratio</b>", y=1.18,
    font=dict(family=" Raleway,sans-serif", size=16, color='#1A1817'),
    **annotation_anchor)
subtitle = go.layout.Annotation(
    text="Total brands and ramen varieties by country", y=1.12,
    font=dict(family='Raleway,sans-serif', size=14, color='#98928E'),
    **annotation_anchor)

fig.update_layout(
    font=dict(family="Raleway, sans-serif", size=12, color='#98928E'),
    plot_bgcolor="#fffaf7",
    paper_bgcolor="#fffaf7",
    showlegend=True,
    height=500,
    annotations=[headline, subtitle],
)
fig.show()

#### 2.2 Which country has the highest competition?
Competition level here is identified by both the number of brands and the number of varieties. As you might expect, Japan has the highest number of both brands (58) and varieties (333). The brand-to-variety ratio seems to be proportional, at least for the top 10 countries.



In [12]:
# Per-country summary, flattened to "<column>_<aggregation>" column names.
# `Review_#` is the review's sequence number (2580, 2579, ... down to 1), i.e. an
# identifier: summing it does NOT give a number of reviews. 'count' is therefore
# added to obtain the real number of reviews per country. The existing mean/sum
# columns are kept so df2 only gains a column.
df2 = df1.groupby('Country').agg({'Variety':pd.Series.nunique,
                                  'Brand':pd.Series.nunique,
                                  'Stars':'mean',
                                  'Review_#':['mean','sum','count'],
                                 })
df2.columns = df2.columns.get_level_values(0)+"_"+df2.columns.get_level_values(1)
df3 = df1.groupby(['Country','Brand']).agg({
                                  'Stars':'mean',
                                  'Review_#':'mean'
                                 })

# Colour scales: translucent for the marker fill, opaque for the marker outline
colorscale1 = [
[0,'rgba(0,204,204,.5)'],
[0.5,'rgba(0,102,102,.5)'],
[1,'rgba(0,25,51,.5)']]

colorscale1_line = [
[0,'rgba(0,204,204,1)'],
[0.5,'rgba(0,102,102,1)'],
[1,'rgba(0,25,51,1)']]

# Real number of reviews per country (one row of df1 == one review)
total_reviews = df2["Review_#_count"]

# One marker per country: mean review number (x) against mean stars (y),
# coloured by how many reviews the country has.
country_trace = go.Scatter(
    x=df2["Review_#_mean"],
    y=df2["Stars_mean"],
    mode="markers",
    name="Country",
    hoverinfo="x+y+z+text",
    text=df2.index,  # country name shown on hover
    marker=dict(
        color=total_reviews,
        size=15,
        colorscale=colorscale1,
        colorbar=dict(
            title=dict(text="<b>TOTAL<br>REVIEWS</b>", font=dict(size=12, color='#1A1817')),
            x=1.02, y=0.95, yanchor="top", len=1),
        line=dict(color=total_reviews, width=1, colorscale=colorscale1_line),
    ),
)
fig = go.Figure(data=country_trace)

# Both axes share the same styling; only the title text differs
axis_title_font = dict(size=12, color='#1A1817')
axis_style = dict(
    ticks="outside", tickwidth=0.5, tickcolor='#F2E9E1', ticklen=10, hoverformat=",.1f",
    showline=True, linewidth=2, linecolor='#F2E9E1',
    showgrid=True, gridwidth=1, gridcolor='#F2E9E1',
    zeroline=True, zerolinewidth=1, zerolinecolor='crimson',
)
# The x values are the mean of the review sequence numbers, so label them as such
# (the previous label, "REVIEWS (average per ramen)", described a quantity the
# data does not contain).
fig.update_xaxes(title=dict(text="<b>REVIEW #</b> (average review number)", font=axis_title_font),
                 **axis_style)
fig.update_yaxes(title=dict(text="<b>STARS</b> (rating)", font=axis_title_font), **axis_style)

# Headline and subtitle sit above the plot area, anchored in paper coordinates
annotation_anchor = dict(x=-0.05, xref="paper", yref="paper", showarrow=False,
                         xanchor="left", yanchor="top", align="left")
headline = go.layout.Annotation(
    text="<b>Most countries with high total reviews, generously rate ramens</b>", y=1.18,
    font=dict(family=" Raleway,sans-serif", size=16, color='#1A1817'),
    **annotation_anchor)
subtitle = go.layout.Annotation(
    text="Countries average stars and reviews color coded by total reviews", y=1.12,
    font=dict(family='Raleway,sans-serif', size=14, color='#98928E'),
    **annotation_anchor)

fig.update_layout(
    font=dict(family="Raleway, sans-serif", size=12, color='#98928E'),
    plot_bgcolor="#fffaf7",
    paper_bgcolor="#fffaf7",
    showlegend=True,
    height=500,
    annotations=[headline, subtitle],
)
fig.show()

#### 2.3 How strictly does each country judge ramen?
Given that ratings range from 0 to 5, most countries with a high total number of reviews (indicated by the darker green color) tend to have average scores above 3, and predominantly above 3.5.

In [13]:
# Rows per brand (non-null Variety entries); keep brands with more than 20 of them
variety_counts = df1.groupby('Brand').count()['Variety']
top_brand = variety_counts[variety_counts > 20].index.values

# Restrict to those brands, then average stars / review number per (country, brand)
df1_top_brand = df1.loc[df1['Brand'].isin(top_brand)]
df1_brand_heatmaps = df1_top_brand.groupby(['Country','Brand']).agg(
    {'Stars': 'mean', 'Review_#': 'mean'}
)

# Heatmap: one cell per (country, brand) pair, coloured by the mean star rating
pair_countries = df1_brand_heatmaps.index.get_level_values(0)
pair_brands = df1_brand_heatmaps.index.get_level_values(1)
rating_heatmap = go.Heatmap(
    x=pair_countries,
    y=pair_brands,
    z=df1_brand_heatmaps['Stars'],
    colorbar=dict(title=dict(text='<b>RATINGS</b><br>(average)',
                             font=dict(size=12, color='#1A1817'))),
)
fig = go.Figure(data=rating_heatmap)

# Both axes share the same styling; only the title text differs
axis_title_font = dict(size=12, color='#1A1817')
axis_style = dict(
    ticks="outside", tickwidth=0.5, tickcolor='#F2E9E1', ticklen=10, hoverformat=",.1f",
    showline=True, linewidth=2, linecolor='#F2E9E1',
    showgrid=True, gridwidth=1, gridcolor='#F2E9E1',
    zeroline=True, zerolinewidth=1, zerolinecolor='crimson',
)
fig.update_xaxes(title=dict(text="<b>COUNTRY</b>", font=axis_title_font), **axis_style)
fig.update_yaxes(title=dict(text = "<b>BRAND</b>", font=axis_title_font), **axis_style)

# Headline and subtitle sit above the plot area, anchored in paper coordinates
annotation_anchor = dict(x=-0.05, xref="paper", yref="paper", showarrow=False,
                         xanchor="left", yanchor="top", align="left")
headline = go.layout.Annotation(
    text="<b>Nissin has the most presence in contries</b>", y=1.18,
    font=dict(family=" Raleway,sans-serif", size=16, color='#1A1817'),
    **annotation_anchor)
subtitle = go.layout.Annotation(
    text="Average rating by brand and country", y=1.12,
    font=dict(family='Raleway,sans-serif', size=14, color='#98928E'),
    **annotation_anchor)

fig.update_layout(
    # allow up to one tick per (country, brand) pair on each axis
    xaxis_nticks=len(pair_countries),
    yaxis_nticks=len(pair_brands),
    font=dict(family="raleway, sans-serif", size=10, color='#98928E'),
    plot_bgcolor="#fffaf7",
    paper_bgcolor="#fffaf7",
    showlegend=True,
    height=500,
    annotations=[headline, subtitle],
)

fig.show()

#### 2.4 What is the leading global brand?
To answer this, I am using each brand's presence across countries, regardless of its ratings. Nissin has the widest presence worldwide, though its ratings do not seem to be outstanding: they are mostly average (orange to red), with even one low score (purple, indicating a rating as low as 2 out of 5).
A couple of things worth noting:
- My Kuali scores an astonishing ~5 (look for the yellow-colored box) in every country where it is present, but bear in mind that this average rating doesn't take into account the total number of reviews. Looking forward to trying this ramen!
- Indomie doesn't seem to perform well in Nigeria. 