# Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [67]:
import pandas as pd

# I NEEDED NUMPY

import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [2]:
file = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'

chipo = pd.read_csv(file, sep='\t')

### Step 4. See the first 10 entries

In [3]:
chipo[:5]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


### Step 5. What is the number of observations in the dataset?

In [70]:
chipo.count()

# Good, but could have also done this

chipo.info()

chipo.shape[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
order_id              4622 non-null int64
quantity              4622 non-null int64
item_name             4622 non-null object
choice_description    3376 non-null object
item_price            4622 non-null float32
dtypes: float32(1), int64(2), object(2)
memory usage: 162.6+ KB


4622

### Step 6. What is the number of columns in the dataset?

In [76]:
chipo.columns
len(chipo.columns)

# fine but could have also done

chipo.shape[1]

5

### Step 7. Print the name of all the columns.

In [6]:
chipo.columns

Index(['order_id', 'quantity', 'item_name', 'choice_description',
       'item_price'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [7]:
chipo.index

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most ordered item?

In [8]:
chipo['item_name'].unique()

array(['Chips and Fresh Tomato Salsa', 'Izze', 'Nantucket Nectar',
       'Chips and Tomatillo-Green Chili Salsa', 'Chicken Bowl',
       'Side of Chips', 'Steak Burrito', 'Steak Soft Tacos',
       'Chips and Guacamole', 'Chicken Crispy Tacos', 'Chicken Soft Tacos',
       'Chicken Burrito', 'Canned Soda', 'Barbacoa Burrito',
       'Carnitas Burrito', 'Carnitas Bowl', 'Bottled Water',
       'Chips and Tomatillo Green Chili Salsa', 'Barbacoa Bowl', 'Chips',
       'Chicken Salad Bowl', 'Steak Bowl', 'Barbacoa Soft Tacos',
       'Veggie Burrito', 'Veggie Bowl', 'Steak Crispy Tacos',
       'Chips and Tomatillo Red Chili Salsa', 'Barbacoa Crispy Tacos',
       'Veggie Salad Bowl', 'Chips and Roasted Chili-Corn Salsa',
       'Chips and Roasted Chili Corn Salsa', 'Carnitas Soft Tacos',
       'Chicken Salad', 'Canned Soft Drink', 'Steak Salad Bowl',
       '6 Pack Soft Drink', 'Chips and Tomatillo-Red Chili Salsa', 'Bowl',
       'Burrito', 'Crispy Tacos', 'Carnitas Crispy Tacos', 'Ste

In [9]:
chipo_items = chipo[['item_name', 'quantity']]

chipo_items_counts = chipo_items.groupby('item_name').aggregate(sum)

chipo_items_counts.sort_values('quantity', ascending=False)[:5]

Unnamed: 0_level_0,quantity
item_name,Unnamed: 1_level_1
Chicken Bowl,761
Chicken Burrito,591
Chips and Guacamole,506
Steak Burrito,386
Canned Soft Drink,351


In [10]:
chipo_item_quantity = chipo.groupby('item_name').aggregate(sum)

top_5_chipo_items = chipo_item_quantity.sort_values('quantity', ascending=False)[:5]

top_5_chipo_items

Unnamed: 0_level_0,order_id,quantity
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicken Bowl,713926,761
Chicken Burrito,497303,591
Chips and Guacamole,449959,506
Steak Burrito,328437,386
Canned Soft Drink,304753,351


### Step 10. How many items were ordered?

In [77]:
chipo[:5]

chipo_quants = chipo['quantity']

chipo_quants.sum()

chipo['quantity'].sum()

# Or do you mean type of items, a count of those?

# len(chipo['item_name'].unique())

4972

### Step 11. What was the most ordered item in the choice_description column?

In [12]:
chipo[:5]

# chipo[chipo['item_name' == 'Canned Soda']]

chipo[['choice_description', 'quantity']].groupby('choice_description').aggregate(sum).sort_values('quantity', ascending=False)[:5]

Unnamed: 0_level_0,quantity
choice_description,Unnamed: 1_level_1
[Diet Coke],159
[Coke],143
[Sprite],89
"[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream, Lettuce]]",49
"[Fresh Tomato Salsa, [Rice, Black Beans, Cheese, Sour Cream]]",42


### Step 12. How many items were orderd in total?

In [15]:
chipo['quantity'].sum()

4972

### Step 13. Turn the item price into a float

In [47]:
# chipo['item_price'] = chipo.item_price.str.replace('$', '')

# chipo['item_price'] = pd.to_numeric(chipo['item_price'], downcast='float')

chipo.dtypes

order_id                int64
quantity                int64
item_name              object
choice_description     object
item_price            float32
dtype: object

In [81]:
# a more elegant way

dollarizer = lambda x: float(x[1:-1])

chipo.item_price = chipo.item_price.apply(dollarizer)

TypeError: 'float' object is not subscriptable

### Step 14. How much was the revenue for the period in the dataset?

In [83]:
chipo['item_price'].aggregate(sum)

# This is wrong. Apparently should have multipled against quantit

revenue = (chipo['quantity'] * chipo['item_price']).sum()
revenue

39237.01973223686

### Step 15. How many orders were made in the period?

In [87]:
# chipo.tail(n=5)

# chipo.order_id.unique()

chipo.order_id.sort_values(ascending=False)[:5]

# or

chipo.order_id.value_counts().count()

1834

### Step 16. What is the average amount per order?

In [65]:
order_prices = chipo[['order_id', 'item_price']]

order_prices = order_prices.groupby('order_id').sum()

order_prices.mean()

# Wrong, but only because of the quantity thing

item_price    18.811426
dtype: float32

### Step 17. How many different items are sold?

In [66]:
len(chipo['item_name'].unique())

50