# Getting started with data

In [1]:
import pandas as pd
import numpy as np

## Step 1: Import Dataset

In [2]:
df = pd.read_csv("chipotle.tsv", sep = "\t")

In [3]:
df.head(5)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


## Step 2: Dataset Overview

In [4]:
df.shape

(4622, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4622 entries, 0 to 4621
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            4622 non-null   int64 
 1   quantity            4622 non-null   int64 
 2   item_name           4622 non-null   object
 3   choice_description  3376 non-null   object
 4   item_price          4622 non-null   object
dtypes: int64(2), object(3)
memory usage: 180.7+ KB


In [6]:
print(list(df.columns))
print(tuple(df.columns))

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']
('order_id', 'quantity', 'item_name', 'choice_description', 'item_price')


In [7]:
df.index

RangeIndex(start=0, stop=4622, step=1)

In [8]:
df.describe()

Unnamed: 0,order_id,quantity
count,4622.0,4622.0
mean,927.254868,1.075725
std,528.890796,0.410186
min,1.0,1.0
25%,477.25,1.0
50%,926.0,1.0
75%,1393.0,1.0
max,1834.0,15.0


## loc and iloc

In [9]:
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


In [10]:
df.loc[(df.order_id < 3000) & (df.quantity == 15), ["order_id", "quantity"]]

Unnamed: 0,order_id,quantity
3598,1443,15


In [11]:
print(df.iloc[[2]])
print(df.iloc[2])

   order_id  quantity         item_name choice_description item_price
2         1         1  Nantucket Nectar            [Apple]     $3.39 
order_id                             1
quantity                             1
item_name             Nantucket Nectar
choice_description             [Apple]
item_price                      $3.39 
Name: 2, dtype: object


In [12]:
x = df.iloc[1:5, :-1]
y = df.iloc[1:5, -1]
print(y)

1     $3.39 
2     $3.39 
3     $2.39 
4    $16.98 
Name: item_price, dtype: object


## Data Manipulation

In [13]:
df.item_price.dtype

dtype('O')

## Apply()


In [14]:
df.item_price = df.item_price.apply(lambda x : float(x.replace('$', '')))

In [15]:
df.head()

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98


In [16]:
df["total_price"] = df["quantity"] * df["item_price"]

In [17]:
df.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,2.39
1,1,1,Izze,[Clementine],3.39,3.39
2,1,1,Nantucket Nectar,[Apple],3.39,3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,33.96
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,10.98
6,3,1,Side of Chips,,1.69,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,9.25


## The revenue for the period in the dataset?

In [18]:
revenue = df.total_price.sum()
print(revenue)

39237.02


## Which was the most ordered item?

## Group By

In [23]:
ordered_item = df.groupby("item_name")["item_name"]
ordered_item
# ordered_item.sort_values(ascending = False)

  ordered_item = df.groupby("item_name").sum()


Unnamed: 0_level_0,order_id,quantity,item_price,total_price
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6 Pack Soft Drink,52322,55,356.95,369.93
Barbacoa Bowl,53972,66,672.36,672.36
Barbacoa Burrito,74718,91,894.75,894.75
Barbacoa Crispy Tacos,5613,12,120.21,138.71
Barbacoa Salad Bowl,9708,10,106.4,106.4
Barbacoa Soft Tacos,18725,25,250.46,250.46
Bottled Water,175944,211,302.56,649.18
Bowl,472,4,29.6,74.0
Burrito,1550,6,44.4,44.4
Canned Soda,76396,126,137.34,191.84


## Unique Value

In [20]:
df.item_name.value_counts().count()

50

In [21]:
df.item_name.nunique()

50