In [179]:
import os
import pandas as pd

In [180]:
os.chdir(r'C:\Users\tanzh\Documents\sandbox folder for python\sample files')

occurrencePandas Functions
================
* pd.concat([df1, df2]) --- to combine DataFrames


DataFrame Methods & Functions
=============================
* df.describe() --- to return the columns' statistics
* df.info() --- to return high level summary of the columns
* df.isnull() --- return a DataFrame with Boolean values testing if cell is na
* df.sample(frac=0.5) --- to randomly return 50% of the data
* df.shape --- to return the rows and columns 
* df.count() --- to return the number of non-zero items in each column
* df.values --- to generate a 2D numpy array
* df.memory_usage(deep=True) --- if deep=True, it will retrieve the true memory usage for string/object datatype else it will retrieve the size of the pointer
* df.select_dtypes(include=[['datetime', 'object']], exclude=[['float']]) --- to include/exclude columns of defined datatypes
* df[['col1']].rolling(5).mean()
* df.duplicated(keep=False) --- return a Boolean series where ALL occurrence of duplicated records will display True

Drop Functions
--------------
* df.drop([['col1']], axis=1, inplace=True) --- to remove data
* df.dropna(axis=0, thresh=4, inplace=False) --- require at least 4 non-NA items otherwise remove row, use subset parameter if targeting specific column
* df.drop_duplicates(keep=False, ignore_index=True) --- if keep=False, drop all duplicates, if keep=first, drop first duplicate occurrence








# Pandas Basics

In [181]:
# how to read files into pandas dataframe
data = pd.read_csv('tesla_stock_yahoo.csv', parse_dates=['Date'])

###########
# if the csv file is very large, we can divide the file into chunks to load into a pandas dataframe
data_chunks = pd.read_csv('tesla_stock_yahoo.csv', parse_dates=['Date'], chunksize=50) # chunks of 50 rows of data
data = pd.concat(data_chunks)

data.head(10) # if no parameter is passed through, the top 5 rows in the table will be returned 

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,,,,,23.889999,18766300.0
1,2010-06-30,,30.42,23.299999,23.83,23.83,17187100.0
2,2010-01-07,25.0,,20.27,21.959999,21.959999,8218800.0
3,2010-02-07,23.0,23.1,,19.200001,19.200001,5139800.0
4,2010-06-07,20.0,20.0,15.83,,16.110001,6866900.0
5,2010-07-07,16.4,16.629999,14.98,15.8,,6921700.0
6,2010-08-07,16.139999,17.52,15.57,17.459999,17.459999,
7,2010-09-07,17.58,17.9,16.549999,17.4,17.4,4050600.0
8,2010-12-07,17.950001,18.07,17.0,17.049999,17.049999,2202500.0
9,2010-07-13,17.389999,18.639999,16.9,18.139999,18.139999,2680100.0


In [182]:
data.info() # provides high level information on the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793 entries, 0 to 1792
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       1793 non-null   datetime64[ns]
 1   Open       1791 non-null   float64       
 2   High       1791 non-null   float64       
 3   Low        1791 non-null   float64       
 4   Close      1791 non-null   float64       
 5   Adj Close  1792 non-null   float64       
 6   Volume     1792 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 98.2 KB


In [183]:
data.describe() # provide high level

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1791.0,1791.0,1791.0,1791.0,1792.0,1792.0
mean,143.051703,145.4922,140.45206,143.040888,142.97457,4444361.0
std,101.900712,103.316433,100.316972,101.884265,101.894485,4306755.0
min,16.139999,16.629999,14.98,15.8,16.110001,118500.0
25%,30.564999,31.355001,29.730001,30.66,30.6325,1262525.0
50%,176.160004,180.470001,171.199997,177.110001,176.96,3417250.0
75%,227.209999,230.794999,222.860001,226.940002,226.915001,5964200.0
max,386.690002,386.98999,379.350006,383.450012,383.450012,37163900.0


In [184]:
data.sample(frac=1) # 0 <= frac <= 1

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
639,2013-09-01,34.009998,34.189999,33.400002,33.639999,33.639999,698000.0
476,2012-05-15,30.260000,30.959999,29.219999,29.430000,29.430000,1585700.0
1034,2014-05-08,237.470001,242.990005,235.690002,238.490005,238.490005,5388600.0
186,2011-03-22,22.730000,22.860001,22.000000,22.190001,22.190001,582900.0
1152,2015-01-23,200.289993,203.500000,198.330002,201.289993,201.289993,3438600.0
...,...,...,...,...,...,...,...
1013,2014-07-07,227.500000,229.779999,220.399994,222.660004,222.660004,5893700.0
286,2011-12-08,25.600000,27.139999,25.360001,26.309999,26.309999,1009100.0
1423,2016-02-22,170.119995,178.910004,169.850006,177.740005,177.740005,5060100.0
1556,2016-08-30,216.110001,216.110001,210.520004,211.339996,211.339996,3168900.0


In [218]:
data['Adj Close'].rolling(5).mean() # aggregation method {mean, median, sum, min, max}

0              NaN
1              NaN
2              NaN
3              NaN
4        20.998000
           ...    
1788    327.692004
1789    330.218005
1790    334.586005
1791    340.926007
1792    350.056006
Name: Adj Close, Length: 1793, dtype: float64

In [187]:
data.sort_values(by=['Close'], ascending=False, inplace=False)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1761,2017-06-23,382.450012,386.989990,379.350006,383.450012,383.450012,6445800.0
1760,2017-06-22,377.989990,385.000000,373.570007,382.609985,382.609985,7529800.0
1754,2017-06-14,381.089996,384.250000,376.309998,380.660004,380.660004,12818400.0
1762,2017-06-26,386.690002,386.950012,373.100006,377.489990,377.489990,6604100.0
1759,2017-06-21,374.350006,376.989990,368.019989,376.399994,376.399994,4923200.0
...,...,...,...,...,...,...,...
7,2010-09-07,17.580000,17.900000,16.549999,17.400000,17.400000,4050600.0
8,2010-12-07,17.950001,18.070000,17.000000,17.049999,17.049999,2202500.0
5,2010-07-07,16.400000,16.629999,14.980000,15.800000,,6921700.0
0,2010-06-29,,,,,23.889999,18766300.0


In [188]:
data.select_dtypes(include=['datetime']) # only columns with datetime datatype will be kept

Unnamed: 0,Date
0,2010-06-29
1,2010-06-30
2,2010-01-07
3,2010-02-07
4,2010-06-07
...,...
1788,2017-02-08
1789,2017-03-08
1790,2017-04-08
1791,2017-07-08


In [190]:
data.dropna(axis=0, inplace=False, thresh=4) # the row need to be a minimum of 4 non-NA otherwise it will be removed

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1,2010-06-30,,30.420000,23.299999,23.830000,23.830000,17187100.0
2,2010-01-07,25.000000,,20.270000,21.959999,21.959999,8218800.0
3,2010-02-07,23.000000,23.100000,,19.200001,19.200001,5139800.0
4,2010-06-07,20.000000,20.000000,15.830000,,16.110001,6866900.0
5,2010-07-07,16.400000,16.629999,14.980000,15.800000,,6921700.0
...,...,...,...,...,...,...,...
1788,2017-02-08,318.940002,327.119995,311.220001,325.890015,325.890015,13091500.0
1789,2017-03-08,345.329987,350.000000,343.149994,347.089996,347.089996,13535000.0
1790,2017-04-08,347.000000,357.269989,343.299988,356.910004,356.910004,9198400.0
1791,2017-07-08,357.350006,359.480011,352.750000,355.170013,355.170013,6276900.0


In [191]:
data.drop_duplicates()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,,,,,23.889999,18766300.0
1,2010-06-30,,30.420000,23.299999,23.830000,23.830000,17187100.0
2,2010-01-07,25.000000,,20.270000,21.959999,21.959999,8218800.0
3,2010-02-07,23.000000,23.100000,,19.200001,19.200001,5139800.0
4,2010-06-07,20.000000,20.000000,15.830000,,16.110001,6866900.0
...,...,...,...,...,...,...,...
1788,2017-02-08,318.940002,327.119995,311.220001,325.890015,325.890015,13091500.0
1789,2017-03-08,345.329987,350.000000,343.149994,347.089996,347.089996,13535000.0
1790,2017-04-08,347.000000,357.269989,343.299988,356.910004,356.910004,9198400.0
1791,2017-07-08,357.350006,359.480011,352.750000,355.170013,355.170013,6276900.0


In [192]:
help(data.drop_duplicates)

Help on method drop_duplicates in module pandas.core.frame:

drop_duplicates(subset: Union[Hashable, Sequence[Hashable], NoneType] = None, keep: Union[str, bool] = 'first', inplace: bool = False, ignore_index: bool = False) -> Union[ForwardRef('DataFrame'), NoneType] method of pandas.core.frame.DataFrame instance
    Return DataFrame with duplicate rows removed.
    
    Considering certain columns is optional. Indexes, including time indexes
    are ignored.
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', False}, default 'first'
        Determines which duplicates (if any) to keep.
        - ``first`` : Drop duplicates except for the first occurrence.
        - ``last`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.
    inplace : bool, default False
        Whethe

In [207]:
data.drop_duplicates(keep=False, ignore_index=True)[:20]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,,,,,23.889999,18766300.0
1,2010-06-30,,30.42,23.299999,23.83,23.83,17187100.0
2,2010-01-07,25.0,,20.27,21.959999,21.959999,8218800.0
3,2010-02-07,23.0,23.1,,19.200001,19.200001,5139800.0
4,2010-06-07,20.0,20.0,15.83,,16.110001,6866900.0
5,2010-07-07,16.4,16.629999,14.98,15.8,,6921700.0
6,2010-08-07,16.139999,17.52,15.57,17.459999,17.459999,
7,2010-09-07,17.58,17.9,16.549999,17.4,17.4,4050600.0
8,2010-12-07,17.950001,18.07,17.0,17.049999,17.049999,2202500.0
9,2010-07-13,17.389999,18.639999,16.9,18.139999,18.139999,2680100.0


In [209]:
data[data.duplicated(keep='last')]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
12,2010-07-16,20.700001,21.299999,20.049999,20.639999,20.639999,2621300.0
14,2010-07-19,21.370001,22.25,20.92,21.91,21.91,2486500.0


In [213]:
help(data.duplicated)

Help on method duplicated in module pandas.core.frame:

duplicated(subset: Union[Hashable, Sequence[Hashable], NoneType] = None, keep: Union[str, bool] = 'first') -> 'Series' method of pandas.core.frame.DataFrame instance
    Return boolean Series denoting duplicate rows.
    
    Considering certain columns is optional.
    
    Parameters
    ----------
    subset : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns.
    keep : {'first', 'last', False}, default 'first'
        Determines which duplicates (if any) to mark.
    
        - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
        - False : Mark all duplicates as ``True``.
    
    Returns
    -------
    Series



In [215]:
data[data.duplicated(keep=False)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
12,2010-07-16,20.700001,21.299999,20.049999,20.639999,20.639999,2621300.0
13,2010-07-16,20.700001,21.299999,20.049999,20.639999,20.639999,2621300.0
14,2010-07-19,21.370001,22.25,20.92,21.91,21.91,2486500.0
15,2010-07-19,21.370001,22.25,20.92,21.91,21.91,2486500.0
