In [1]:
import pandas as pd

In [2]:
# a Series object is single-column data, i.e. a set of values that correspond to a single variable
# that is why we can create a Series object from a list

In [3]:
# Raw product labels, kept exactly as entered (including the stray spaces).
products = [
    "A ",
    "B ",
    "C",
    " D",
]
products

['A ', 'B ', 'C', ' D']

In [4]:
# Confirm that `products` is a plain Python list.
type(products)

list

In [5]:
# Build a Series from the list; pandas attaches a default integer index (0, 1, 2, ...).
product_categories = pd.Series(products)

In [6]:
product_categories # left column shows the index labels of the Series, right column shows the values
# `object` is the dtype pandas reports here for the string (non-numeric) data

0    A 
1    B 
2     C
3     D
dtype: object

In [7]:
type(product_categories) # tip: press Tab to autocomplete variable names

pandas.core.series.Series

In [8]:
# Same type check without going through an intermediate variable.
type(pd.Series(products))

pandas.core.series.Series

In [9]:
# Numeric Series built directly from a list literal; dtype is inferred as int64.
daily_rates_dollars = pd.Series(data=[40, 45, 50, 60])
daily_rates_dollars

0    40
1    45
2    50
3    60
dtype: int64

In [10]:
# print() produces the same plain-text rendering as displaying the Series directly.
print(daily_rates_dollars)

0    40
1    45
2    50
3    60
dtype: int64


In [11]:
# a pandas Series object corresponds to a one-dimensional NumPy array structure

In [12]:
import numpy as np

In [13]:
# One-dimensional NumPy array used as the source for a Series.
array_a = np.array([10, 20, 30, 40, 50])
array_a

array([10, 20, 30, 40, 50])

In [14]:
# array_a is a NumPy ndarray, not a list.
type(array_a)

numpy.ndarray

In [15]:
# Wrap the NumPy array in a Series: same values, plus a default 0..n-1 index.
series_a = pd.Series(array_a)
series_a

0    10
1    20
2    30
3    40
4    50
dtype: int32

In [16]:
# print() produces the same plain-text rendering as displaying the Series directly.
print(series_a)

0    10
1    20
2    30
3    40
4    50
dtype: int32


In [17]:
# The result of pd.Series(<ndarray>) is a pandas Series.
type(series_a)

pandas.core.series.Series

In [18]:
# 1. a pandas Series object is something like a more powerful version of the Python list, or an enhanced version of the NumPy array
# a Series should not automatically be the preferred choice in every situation:
# it depends on what you want to obtain from the data versus the speed and precision with which you can do that
# if you can work with Series, you have a larger set of tools and capabilities that are available in the pandas library only
# This is because the Series object stores its values in a sequenced order and has an explicit index
# you only really see the advantages once you are more experienced
# 2. the second takeaway is that you should always maintain data consistency
# it is always better to have data of a single type only in an array etc.

## Using methods in Python

In [19]:
# almost every entity in Python is an object containing data, metadata and some functionality
# a Python object is associated with a certain collection of attributes and methods
# attributes provide the metadata, while methods provide the functionality and behavior of the object
# attributes are passive since they can only give information about the specific object
# methods are active since they work on the data stored in the object

In [20]:
## both methods and functions, when provided with some initial data,
## can perform specific operations on it and return an output
# a function is an independent entity since it is not associated with an object on construction
# a method of a certain package is generally applied to an object of a certain class
# a method can access the object's data and manipulate the object's state
# different libraries have their own sets of methods (you need an object to use a method); a method can only be used with objects of its own library
# certain methods exist only for pandas Series, not for other objects

In [21]:
# Series built from a dict: the keys (M/D/YYYY date strings) become the index
# labels and the integers become the values, in insertion order.
start_date_deposits = pd.Series(
    {
        "7/4/2014": 2000,
        "1/12/2015": 2000,
        "12/18/2012": 1000,
        "2/20/2015": 2000,
        "10/28/2013": 2000,
        "4/19/2015": 2000,
        "7/4/2016": 2000,
        "4/24/2014": 2000,
        "9/3/2015": 4000,
        "7/25/2016": 2000,
        "5/1/2014": 2000,
        "3/29/2013": 2000,
        "10/3/2014": 2000,
        "9/18/2015": 2500,
    }
)

In [22]:
# Display the Series: date strings are the index labels, the integers are the values.
start_date_deposits

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
4/19/2015     2000
7/4/2016      2000
4/24/2014     2000
9/3/2015      4000
7/25/2016     2000
5/1/2014      2000
3/29/2013     2000
10/3/2014     2000
9/18/2015     2500
dtype: int64

In [23]:
# Without parentheses this only references the bound method object; nothing is computed.
start_date_deposits.sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of 7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
4/19/2015     2000
7/4/2016      2000
4/24/2014     2000
9/3/2015      4000
7/25/2016     2000
5/1/2014      2000
3/29/2013     2000
10/3/2014     2000
9/18/2015     2500
dtype: int64>

In [24]:
# With parentheses the method is actually called and returns the total of the values.
start_date_deposits.sum()

29500

In [25]:
start_date_deposits.min()  # returns the smallest value in the Series

1000

In [26]:
start_date_deposits.max() # like sum() and min(), this returns information computed from the data, not metadata about the object

4000

In [27]:
start_date_deposits.idxmax() # index label of the largest value

'9/3/2015'

In [28]:
start_date_deposits.idxmin() # index label of the smallest value

'12/18/2012'

In [29]:
# constantly using methods 

In [30]:
# pandas is a library which builds on the computational abilities of NumPy

In [31]:
# NumPy is good for numeric data only, pandas for both numeric and non-numeric data
## if you are in a situation where you need to handle multiple data types,
## pandas won't deprive you of mathematical operations when you need them
# pandas has non-mathematical methods too

In [32]:
# NOTE(review): this rebuilds the same Series (identical keys and values) that
# was already created earlier in the notebook; presumably repeated so this
# section can be run on its own -- confirm, otherwise the cell is redundant.
start_date_deposits = pd.Series({
    '7/4/2014'  : 2000,
    '1/12/2015' : 2000,
    '12/18/2012': 1000,
    '2/20/2015' : 2000,
    '10/28/2013': 2000,
    '4/19/2015' : 2000,
    '7/4/2016'  : 2000,
    '4/24/2014' : 2000,
    '9/3/2015'  : 4000,
    '7/25/2016' : 2000,
    '5/1/2014'  : 2000,
    '3/29/2013' : 2000,
    '10/3/2014' : 2000,
    '9/18/2015' : 2500
})

In [33]:
start_date_deposits.head() # first 5 rows: a quick and easy way to inspect the structure of the dataset

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
dtype: int64

In [34]:
start_date_deposits.tail() # last 5 rows (the counterpart of head())

7/25/2016    2000
5/1/2014     2000
3/29/2013    2000
10/3/2014    2000
9/18/2015    2500
dtype: int64

In [35]:
# we use the object's data to get a certain output, and this is not an inflexible or restrictive process
# one of the best features of methods is that we can modify their behavior
# we do so by knowing the parameters associated with a given method and assigning relevant arguments on execution


In [36]:
# NOTE(review): this rebuilds the same Series (identical keys and values) that
# was already created earlier in the notebook; presumably repeated so this
# section can be run on its own -- confirm, otherwise the cell is redundant.
start_date_deposits = pd.Series({
    '7/4/2014'  : 2000,
    '1/12/2015' : 2000,
    '12/18/2012': 1000,
    '2/20/2015' : 2000,
    '10/28/2013': 2000,
    '4/19/2015' : 2000,
    '7/4/2016'  : 2000,
    '4/24/2014' : 2000,
    '9/3/2015'  : 4000,
    '7/25/2016' : 2000,
    '5/1/2014'  : 2000,
    '3/29/2013' : 2000,
    '10/3/2014' : 2000,
    '9/18/2015' : 2500
})

In [37]:
# we can also be specific about the output: the argument sets how many rows are returned
start_date_deposits.head(3)

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
dtype: int64

In [38]:
# Same method with a larger argument: the first 10 rows.
start_date_deposits.head(10)

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
4/19/2015     2000
7/4/2016      2000
4/24/2014     2000
9/3/2015      4000
7/25/2016     2000
dtype: int64

In [39]:
start_date_deposits.head() # head() is a method; its parameter `n` allows us to modify the way in which the method will operate
# `n` gives us the option to set the number of rows
# `n` is called a parameter of the head method
# the number passed inside the parentheses is the argument
# the default argument is 5, so calling head() with no argument raises no error
# a parameter of a Python method or function always has a name

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
dtype: int64

In [40]:
start_date_deposits.head(n = 10) 
# we can name the parameter deliberately to make it clear to the reader what modification we are making to the method
# head has just a single parameter
# pandas methods have parameters that we supply with arguments to modify the behavior of a given method
# it is good practice to refer to parameter names explicitly and to keep them in the right order

7/4/2014      2000
1/12/2015     2000
12/18/2012    1000
2/20/2015     2000
10/28/2013    2000
4/19/2015     2000
7/4/2016      2000
4/24/2014     2000
9/3/2015      4000
7/25/2016     2000
dtype: int64

## Pandas Series: unique() and nunique()

In [41]:
## a pandas Series object represents single-column data, or a set of observations
## related to a single variable, with a 1-D NumPy array structure

In [42]:
# good to back up data cleaning and data manipulation with analytical intuition

In [43]:
# Load the single-column CSV and squeeze the resulting DataFrame into a Series.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0, so the squeeze is done on the DataFrame instead.
data = pd.read_csv('location.csv').squeeze("columns")
# Work on a copy so the freshly loaded data stays untouched.
location_data = data.copy()
location_data.head()

0     Location 3
1     Location 6
2     Location 8
3    Location 26
4    Location 34
Name: Location, dtype: object

In [44]:
# Check that squeezing the single-column data produced a Series.
type(location_data)

pandas.core.series.Series

In [45]:
location_data.describe() # a data analyst would use this for a quick summary description of the data
# `top` is the most frequent value: the string 'Location 25' appears 31 times (`freq`)

count            1043
unique            296
top       Location 25
freq               31
Name: Location, dtype: object

In [46]:
# Total number of observations in the Series.
len(location_data)

1043

In [48]:
location_data.nunique() # number of distinct values in the Series

296

In [49]:
# nunique() returns a plain Python int, not a Series.
type(location_data.nunique()) 

int

In [50]:
location_data.unique() # returns the distinct values as an array, in the order they first appear in the dataset

array(['Location 3', 'Location 6', 'Location 8', 'Location 26',
       'Location 34', 'Location 25', 'Location 46', 'Location 156',
       'Location 21', 'Location 13', 'Location 579', 'Location 602',
       'Location 10', 'Location 44', 'Location 30', 'Location 48',
       'Location 196', 'Location 64', 'Location 91', 'Location 62',
       'Location 75', 'Location 42', 'Location 233', 'Location 95',
       'Location 78', 'Location 61', 'Location 87', 'Location 19',
       'Location 115', 'Location 350', 'Location 377', 'Location 17',
       'Location 113', 'Location 81', 'Location 58', 'Location 212',
       'Location 53', 'Location 337', 'Location 41', 'Location 632',
       'Location 73', 'Location 214', 'Location 218', 'Location 38',
       'Location 172', 'Location 197', 'Location 101', 'Location 185',
       'Location 129', 'Location 235', 'Location 142', 'Location 50',
       'Location 76', 'Location 11', 'Location 33', 'Location 22',
       'Location 145', 'Location 203', 'Loca

## Pandas Series: sort_values()

In [None]:
# sometimes need to order data by values of one or a few of the columns not just by index 

In [52]:
# Small unsorted numeric Series used to demonstrate sort_values().
numbers = pd.Series(data=[15, 1000, 23, 45, 444])
numbers

0      15
1    1000
2      23
3      45
4     444
dtype: int64

In [53]:
numbers.sort_values() # by default the values are sorted in ascending order; each index label stays with its value

0      15
2      23
3      45
4     444
1    1000
dtype: int64

In [54]:
numbers.sort_values(ascending=True)  # same as the default; tip: the Shift+Tab combo lets us check a method's documentation

0      15
2      23
3      45
4     444
1    1000
dtype: int64

In [55]:
# ascending=False reverses the order: largest values first.
numbers.sort_values(ascending=False)

1    1000
4     444
3      45
2      23
0      15
dtype: int64

In [56]:
# Reload the CSV and squeeze the single-column DataFrame into a Series.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and
# removed in 2.0, so the squeeze is done on the DataFrame instead.
data = pd.read_csv('location.csv').squeeze("columns")
location_data = data.copy() # ensure we have a Series that is free to use and modify
location_data.head()

0     Location 3
1     Location 6
2     Location 8
3    Location 26
4    Location 34
Name: Location, dtype: object

In [57]:
location_data.sort_values() # strings are sorted alphabetically, character by character, not by the number they contain
# e.g. 'Location 10' comes right after 'Location 1', and 'Location 99' ends up last

637     Location 1
884     Location 1
465     Location 1
716    Location 10
623    Location 10
          ...     
482    Location 97
128    Location 97
669    Location 97
757    Location 98
372    Location 99
Name: Location, Length: 1043, dtype: object

In [58]:
# Explicit ascending=True gives the same result as the default call.
location_data.sort_values(ascending=True) 

637     Location 1
884     Location 1
465     Location 1
716    Location 10
623    Location 10
          ...     
482    Location 97
128    Location 97
669    Location 97
757    Location 98
372    Location 99
Name: Location, Length: 1043, dtype: object

In [59]:
# Reverse alphabetical order: 'Location 99' first, 'Location 1' last.
location_data.sort_values(ascending=False) 

372    Location 99
757    Location 98
128    Location 97
482    Location 97
271    Location 97
          ...     
202    Location 10
298    Location 10
637     Location 1
884     Location 1
465     Location 1
Name: Location, Length: 1043, dtype: object

In [None]:
# the index labels follow the data: sorting reorders the values, and each index label stays attached to its value