In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Load the sales CSV from the local `dataset` directory into `df`.
# The pure-Python parser is requested explicitly through `engine`.
df = pd.read_csv(
    os.path.join('dataset', 'sales_data_sample.csv'),
    engine='python',
)

In [3]:
# Show every column label of the loaded frame (printed as a pandas Index),
# so later cells can refer to the exact column names.
print(df.columns)

Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
       'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
       'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE',
       'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE',
       'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME',
       'DEALSIZE'],
      dtype='object')


In [4]:
# Project the frame down to the ORDERNUMBER and ORDERDATE columns
# (all rows kept) and print the result.
print(df.loc[:, ['ORDERNUMBER', 'ORDERDATE']])

      ORDERNUMBER        ORDERDATE
0           10107   2/24/2003 0:00
1           10121    5/7/2003 0:00
2           10134    7/1/2003 0:00
3           10145   8/25/2003 0:00
4           10159  10/10/2003 0:00
...           ...              ...
2818        10350   12/2/2004 0:00
2819        10373   1/31/2005 0:00
2820        10386    3/1/2005 0:00
2821        10397   3/28/2005 0:00
2822        10414    5/6/2005 0:00

[2823 rows x 2 columns]


In [5]:
# First 10 rows, restricted to ORDERNUMBER and ORDERDATE.
# Rows are cut first, then the two columns are picked.
print(df.head(10)[['ORDERNUMBER', 'ORDERDATE']])

   ORDERNUMBER        ORDERDATE
0        10107   2/24/2003 0:00
1        10121    5/7/2003 0:00
2        10134    7/1/2003 0:00
3        10145   8/25/2003 0:00
4        10159  10/10/2003 0:00
5        10168  10/28/2003 0:00
6        10180  11/11/2003 0:00
7        10188  11/18/2003 0:00
8        10201   12/1/2003 0:00
9        10211   1/15/2004 0:00


In [6]:
# Last 10 rows, restricted to ORDERNUMBER and ORDERDATE.
# Rows are cut first, then the two columns are picked.
print(df.tail(10)[['ORDERNUMBER', 'ORDERDATE']])

      ORDERNUMBER        ORDERDATE
2813        10293    9/9/2004 0:00
2814        10306  10/14/2004 0:00
2815        10315  10/29/2004 0:00
2816        10327  11/10/2004 0:00
2817        10337  11/21/2004 0:00
2818        10350   12/2/2004 0:00
2819        10373   1/31/2005 0:00
2820        10386    3/1/2005 0:00
2821        10397   3/28/2005 0:00
2822        10414    5/6/2005 0:00


In [7]:
# All rows belonging to order number 10350, showing only
# ORDERNUMBER and ORDERDATE. Row mask and column list go through
# a single .loc lookup.
print(df.loc[df['ORDERNUMBER'] == 10350, ['ORDERNUMBER', 'ORDERDATE']])

      ORDERNUMBER       ORDERDATE
126         10350  12/2/2004 0:00
955         10350  12/2/2004 0:00
1008        10350  12/2/2004 0:00
1085        10350  12/2/2004 0:00
1310        10350  12/2/2004 0:00
1611        10350  12/2/2004 0:00
1840        10350  12/2/2004 0:00
1964        10350  12/2/2004 0:00
2270        10350  12/2/2004 0:00
2399        10350  12/2/2004 0:00
2426        10350  12/2/2004 0:00
2478        10350  12/2/2004 0:00
2529        10350  12/2/2004 0:00
2609        10350  12/2/2004 0:00
2712        10350  12/2/2004 0:00
2738        10350  12/2/2004 0:00
2818        10350  12/2/2004 0:00


In [8]:
# Distinct YEAR_ID values, in order of first appearance.
print(pd.unique(df['YEAR_ID']))

[2003 2004 2005]


In [9]:
# Combining several row conditions with `&`:
# order numbers of the rows where MONTH_ID is 12 and YEAR_ID is 2004.
print(df.loc[(df['MONTH_ID'] == 12) & (df['YEAR_ID'] == 2004), 'ORDERNUMBER'])


21      10361
48      10357
75      10361
126     10350
127     10359
        ...  
2763    10352
2764    10361
2790    10352
2791    10361
2818    10350
Name: ORDERNUMBER, Length: 110, dtype: int64


In [10]:
# Order the rows by ORDERNUMBER (ascending, the default) and show
# only ORDERNUMBER and ORDERDATE.
print(df.sort_values(by='ORDERNUMBER').loc[:, ['ORDERNUMBER', 'ORDERDATE']])


      ORDERNUMBER       ORDERDATE
578         10100   1/6/2003 0:00
2024        10100   1/6/2003 0:00
680         10100   1/6/2003 0:00
1267        10100   1/6/2003 0:00
728         10101   1/9/2003 0:00
...           ...             ...
2405        10425  5/31/2005 0:00
393         10425  5/31/2005 0:00
160         10425  5/31/2005 0:00
780         10425  5/31/2005 0:00
727         10425  5/31/2005 0:00

[2823 rows x 2 columns]


In [11]:
# Same ordering as the ascending sort, but with the highest
# ORDERNUMBER first.
print(df.sort_values(by='ORDERNUMBER', ascending=False).loc[:, ['ORDERNUMBER', 'ORDERDATE']])


      ORDERNUMBER       ORDERDATE
1667        10425  5/31/2005 0:00
393         10425  5/31/2005 0:00
780         10425  5/31/2005 0:00
679         10425  5/31/2005 0:00
1064        10425  5/31/2005 0:00
...           ...             ...
830         10101   1/9/2003 0:00
1267        10100   1/6/2003 0:00
578         10100   1/6/2003 0:00
2024        10100   1/6/2003 0:00
680         10100   1/6/2003 0:00

[2823 rows x 2 columns]


In [12]:
# Every row whose ORDERNUMBER is neither 10350 nor 10351
# (negated membership test), showing ORDERNUMBER and ORDERDATE.
print(df.loc[~df['ORDERNUMBER'].isin([10350, 10351]), ['ORDERNUMBER', 'ORDERDATE']])


      ORDERNUMBER        ORDERDATE
0           10107   2/24/2003 0:00
1           10121    5/7/2003 0:00
2           10134    7/1/2003 0:00
3           10145   8/25/2003 0:00
4           10159  10/10/2003 0:00
...           ...              ...
2817        10337  11/21/2004 0:00
2819        10373   1/31/2005 0:00
2820        10386    3/1/2005 0:00
2821        10397   3/28/2005 0:00
2822        10414    5/6/2005 0:00

[2801 rows x 2 columns]


In [13]:
# Number of sales rows recorded in each year
# (size() counts rows per YEAR_ID group).
print(df.groupby('YEAR_ID').size())


YEAR_ID
2003    1000
2004    1345
2005     478
dtype: int64


In [14]:
# Total QUANTITYORDERED per year.
# The column is selected *before* aggregating so that only the one
# numeric column is summed; summing the whole grouped frame would also
# try to "sum" the free-text columns, which is wasteful and whose
# behaviour differs between pandas versions.
print(df.groupby('YEAR_ID')['QUANTITYORDERED'].sum())


YEAR_ID
2003    34612
2004    46824
2005    17631
Name: QUANTITYORDERED, dtype: int64


In [15]:
# Per-year summary of QUANTITYORDERED: largest and smallest quantity on
# a single row, the mean quantity per row, and the number of rows.
# Note: the grouping key is YEAR_ID only, so these are per-row statistics
# within each year, not per-day figures.
print(
    df.groupby('YEAR_ID')['QUANTITYORDERED']
      .agg(['max', 'min', 'mean', 'count'])
)


         max  min       mean  count
YEAR_ID                            
2003      50   20  34.612000   1000
2004      55   20  34.813383   1345
2005      97    6  36.884937    478


In [16]:
# Total QUANTITYORDERED per year, listed from the smallest yearly total
# to the largest.
# Only the quantity column is aggregated (selected before .sum()), which
# avoids summing the unrelated text columns of the frame; the resulting
# Series is then sorted by its values.
print(df.groupby('YEAR_ID')['QUANTITYORDERED'].sum().sort_values())


YEAR_ID
2005    17631
2003    34612
2004    46824
Name: QUANTITYORDERED, dtype: int64


In [17]:
# Yearly totals, keeping only the years whose summed QUANTITYORDERED
# exceeds 30000.
# `grp` holds the per-year sums of the frame, indexed by YEAR_ID.
grp = df.groupby('YEAR_ID').sum()
print(grp.loc[grp['QUANTITYORDERED'] > 30000, 'QUANTITYORDERED'])


YEAR_ID
2003    34612
2004    46824
Name: QUANTITYORDERED, dtype: int64
