# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
#import pandas package
import pandas as pd
#import numpy package
import numpy as np

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [2]:
#name url. It is a tab delimiated file (a file containing tabs that separate information with one record per line)
data = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"
#import data
#still use read_csv
#but we will explicitly tell the method that the separator is the tab character and not a comma which is the default
chipo = pd.read_csv(data, sep = "\t")

### Step 4. See the first 10 entries

In [51]:
chipo.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,revenue
0,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,1,1,Izze,[Clementine],3.39,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,10.98
6,3,1,Side of Chips,,1.69,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,9.25


### Step 5. What is the number of observations in the dataset?

In [5]:
#Solution 1
#shape[0] gives the number of rows
#shape [1] gives the number of columns
chipo.shape[0]

4622

In [50]:
#Solution 2
chipo.info()
# .info() gives:
    #number of columns
    #column labels
    #column data type
    #number of cells in each column (non-null values)
    #range index
    #memory usage
    #Note: the info() method actually prints the info.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   order_id            4622 non-null   int64  
 1   quantity            4622 non-null   int64  
 2   item_name           4622 non-null   object 
 3   choice_description  3376 non-null   object 
 4   item_price          4622 non-null   float64
 5   revenue             4622 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 216.8+ KB


### Step 6. What is the number of columns in the dataset?

In [49]:
#shape[0] gives the number of rows
#shape [1] gives the number of columns
#calling with brackets requires an input
chipo.shape[1]
#calling without brackets gives the rows then the columns
    #chipo.shape

6

### Step 7. Print the name of all the columns.

In [52]:
chipo.columns
#gives the column labels of the data frame

Index(['order_id', 'quantity', 'item_name', 'choice_description', 'item_price',
       'revenue'],
      dtype='object')

### Step 8. How is the dataset indexed?

In [53]:
chipo.index
#give the start, stop, and the step

RangeIndex(start=0, stop=4622, step=1)

### Step 9. Which was the most-ordered item? 

In [57]:
#groups data by selected column
m = chipo.groupby('item_name')
#sums the values of all columns
m = m.sum()
#sorts the values based on the quantity
    #ascending = False makes the data be displayed in descending order
m = m.sort_values(by = 'quantity', ascending = False)
m.head(1)
#chicken bowl

Unnamed: 0_level_0,order_id,quantity,item_price,revenue
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chicken Bowl,713926,761,7342.73,8044.63


In [62]:
t = chipo['item_name'].value_counts()
t.head()
#Does not work because the value counts is just the number of line entries but does not count lines that have more than just one 

Chicken Bowl           726
Chicken Burrito        553
Chips and Guacamole    479
Steak Burrito          368
Canned Soft Drink      301
Name: item_name, dtype: int64

### Step 10. For the most-ordered item, how many items were ordered?

In [65]:
#group data based on 'item_name' column and sum the values
m = chipo.groupby('item_name').sum()
#sorts values by the quantity in descending order
m = m.sort_values(by = 'quantity', ascending = False)
m.head(1)
#761

Unnamed: 0_level_0,order_id,quantity,item_price,revenue
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chicken Bowl,713926,761,7342.73,8044.63


### Step 11. What was the most ordered item in the choice_description column?

In [13]:
c = chipo.groupby('choice_description').sum()
c = c.sort_values(by = 'quantity', ascending = False)
c.head(1)
#Diet Coke

Unnamed: 0_level_0,order_id,quantity
choice_description,Unnamed: 1_level_1,Unnamed: 2_level_1
[Diet Coke],123455,159


### Step 12. How many items were orderd in total?

In [14]:
chipo['quantity'].sum()
#4972

4972

In [15]:
chipo.quantity.sum()

4972

### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [16]:
chipo.item_price.dtype

dtype('O')

#### Step 13.b. Create a lambda function and change the type of item price

In [17]:
makeFloat = lambda x: float(x[1:-1])
chipo.item_price = chipo.item_price.apply(makeFloat)

#### Step 13.c. Check the item price type

In [18]:
chipo.item_price.dtype

dtype('float64')

### Step 14. How much was the revenue for the period in the dataset?

In [66]:
#Creates revenue column by multiplying two columns together
revenue = (chipo['quantity']*chipo['item_price']).sum()

print("$" + str(np.round(revenue,2)))


$39237.02


Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,revenue
0,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,1,1,Izze,[Clementine],3.39,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96


### Step 15. How many orders were made in the period?

In [27]:
#Solution 1
t = chipo.order_id
t.tail(1)
#1834

4621    1834
Name: order_id, dtype: int64

In [68]:
#Solution 2
many = chipo.order_id.value_counts().count()
many

1834

### Step 16. What is the average revenue amount per order?

In [33]:
# Solution 1
revenue = (chipo['quantity']*chipo['item_price']).sum()
rev = (np.round(revenue,2))
many = chipo.order_id.value_counts().count()
avg = rev/many
print (avg)

21.39423118865867


In [43]:
chipo['revenue'] = chipo['quantity'] * chipo['item_price']
orderGroup = chipo.groupby(by=['order_id']).sum()
orderGroup.mean()['revenue']

21.394231188658654

In [42]:
# Solution 2
chipo.groupby(by=['order_id']).sum().mean()['revenue']


21.394231188658654

### Step 17. How many different items are sold?

In [46]:
chipo.item_name.value_counts().count()

50