# Preliminaries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw Excel workbook into a dataframe and display it.
excel_path = './rawdata.xlsx'
df = pd.read_excel(excel_path)
df

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2
5,2,12/31/2009,0.34,Construction,2


In [3]:
# Data type pandas assigned to each column when the file was read
df.dtypes

firmid        int64
date         object
return      float64
industry     object
ind_code     object
dtype: object

In [4]:
# Per-column overview: number of non-null entries and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   firmid    6 non-null      int64  
 1   date      5 non-null      object 
 2   return    5 non-null      float64
 3   industry  5 non-null      object 
 4   ind_code  6 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 372.0+ bytes


The "object" data type (here, the ``date``, ``industry`` and ``ind_code`` columns) is a catch-all term for when Pandas cannot determine the exact data type of that column (e.g. int, float, str, etc.). Columns containing strings will often have this data type.

In [5]:
# Summary statistics; only the numeric columns (firmid, return) are reported
df.describe()

Unnamed: 0,firmid,return
count,6.0,5.0
mean,1.833333,4.942
std,0.752773,10.099018
min,1.0,0.05
25%,1.25,0.34
50%,2.0,0.45
75%,2.0,0.87
max,3.0,23.0


# Missing values

Missing values appear as a special code depending on the datatype of the column in which they appear: ``NaN`` (which stands for "not a number") for numeric data types, ``None`` or ``NaN`` for object data type, ``NaT`` for "datetime" columns (more on this data type later). 

To find the missing values in the data, we can use the ``.isnull()`` function (or its equivalent, ``.isna()``):

In [6]:
# Boolean Series: True where ``return`` is missing, False otherwise
df['return'].isnull()

0    False
1    False
2    False
3    False
4     True
5    False
Name: return, dtype: bool

In [7]:
# Build the mask first, then use it to show only the rows with a missing ``return``
return_is_missing = df['return'].isnull()
df.loc[return_is_missing, :]

Unnamed: 0,firmid,date,return,industry,ind_code
4,2,12/31/2009,,Construct,2


We can drop all the rows that have any missing values using the ``.dropna()`` function:

In [8]:
# how='any' (the default) removes a row as soon as one of its columns is missing
df2 = df.dropna(how='any')
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
3,1,12/31/2009,0.87,Finance,1
5,2,12/31/2009,0.34,Construction,2


If we want to drop the rows that have **only** missing values, we have to use ``how='all'`` as a parameter:

In [9]:
# how='all' drops a row only when every column in it is missing;
# no row here is entirely empty, so all 6 rows are kept
df2 = df.dropna(how='all')
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2
5,2,12/31/2009,0.34,Construction,2


If we want to remove all rows that contain missing values in a given column, we can use ``.loc[]`` and the ``.notnull()`` function:

In [10]:
# Mask of rows that do have a ``return`` value, then keep just those rows
return_is_present = df['return'].notnull()
df2 = df.loc[return_is_present, :]
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
5,2,12/31/2009,0.34,Construction,2


or we can use the ``subset`` parameter of the ``dropna`` function, which tells the function to look for missing values only in a subset of the columns: 

In [11]:
# Only ``return`` is inspected: rows with gaps in other columns (date, industry) are kept
df2 = df.dropna(subset=['return'])
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
5,2,12/31/2009,0.34,Construction,2


# Changing data types

Many times, a particular column in our dataframe does not have the datatype we want. There are several functions that allow us to convert one datatype to another. Below, we cover the most commonly used ones:

## ``.astype()``

Specify the new datatype that you want to convert to as an argument to ``.astype()``:

In [12]:
# Work on a copy so the type conversions below do not modify ``df``
df2 = df.copy()
df2.dtypes

firmid        int64
date         object
return      float64
industry     object
ind_code     object
dtype: object

In [13]:
# Convert firmid from int64 to float64, then confirm the new dtype
df2['firmid'] = df2['firmid'].astype('float64')
df2.dtypes

firmid      float64
date         object
return      float64
industry     object
ind_code     object
dtype: object

In [14]:
# firmid values are now displayed with a decimal point (1.0, 2.0, ...)
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1.0,12/31/2008,0.05,Finance,1
1,2.0,,0.45,Construction,2
2,3.0,12/31/2008,23.0,,M
3,1.0,12/31/2009,0.87,Finance,1
4,2.0,12/31/2009,,Construct,2
5,2.0,12/31/2009,0.34,Construction,2


In [15]:
# Convert firmid (currently float64) to pandas' dedicated string dtype
df2['firmid'] = df2['firmid'].astype('string')
df2.dtypes

firmid      string[python]
date                object
return             float64
industry            object
ind_code            object
dtype: object

In [16]:
# The printed frame looks the same as before the string conversion
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1.0,12/31/2008,0.05,Finance,1
1,2.0,,0.45,Construction,2
2,3.0,12/31/2008,23.0,,M
3,1.0,12/31/2009,0.87,Finance,1
4,2.0,12/31/2009,,Construct,2
5,2.0,12/31/2009,0.34,Construction,2


It may not look like ``firmid`` is a string data type now, but it is. For example, the below command would not work if ``firmid`` was still numeric:

In [17]:
# Element-wise concatenation: every firmid string gets the suffix appended
suffix = "abc"
df2['newid'] = df2['firmid'] + suffix
df2

Unnamed: 0,firmid,date,return,industry,ind_code,newid
0,1.0,12/31/2008,0.05,Finance,1,1.0abc
1,2.0,,0.45,Construction,2,2.0abc
2,3.0,12/31/2008,23.0,,M,3.0abc
3,1.0,12/31/2009,0.87,Finance,1,1.0abc
4,2.0,12/31/2009,,Construct,2,2.0abc
5,2.0,12/31/2009,0.34,Construction,2,2.0abc


## ``.to_numeric()``

This is commonly used to convert string (or object) data types to a numeric data type. Unlike ``.astype()``, which is called as a method on the column we want to convert, ``to_numeric()`` is a function of the Pandas package itself (``pd.to_numeric()``), so you have to supply that column as an argument:

In [18]:
# Starting point: firmid and newid are both string columns
df2.dtypes

firmid      string[python]
date                object
return             float64
industry            object
ind_code            object
newid       string[python]
dtype: object

In [19]:
# Parse the firmid strings back into numbers (the result here is the nullable Float64 dtype)
df2['firmid'] = pd.to_numeric(df2['firmid'])
df2.dtypes

firmid             Float64
date                object
return             float64
industry            object
ind_code            object
newid       string[python]
dtype: object

In some situations, the ``.to_numeric()`` function will not be successful unless you specify the parameter ``errors='coerce'``. For example, the code below would not work without that parameter (which is why I always specify it):

In [20]:
# errors='coerce' turns entries that cannot be parsed as numbers (here "M") into NaN
df2['ind_code'] = pd.to_numeric(df2['ind_code'], errors='coerce')
df2.dtypes

firmid             Float64
date                object
return             float64
industry            object
ind_code           float64
newid       string[python]
dtype: object

Note that this converted the non-numeric values in the ``ind_code`` column to ``NaN``:

In [21]:
# Row 2, whose ind_code was "M", now shows a missing value in that column
df2

Unnamed: 0,firmid,date,return,industry,ind_code,newid
0,1.0,12/31/2008,0.05,Finance,1.0,1.0abc
1,2.0,,0.45,Construction,2.0,2.0abc
2,3.0,12/31/2008,23.0,,,3.0abc
3,1.0,12/31/2009,0.87,Finance,1.0,1.0abc
4,2.0,12/31/2009,,Construct,2.0,2.0abc
5,2.0,12/31/2009,0.34,Construction,2.0,2.0abc


# Duplicates and counts

In many situations, it is important to know if our data contains any duplicate entries (most of the time we want to eliminate those) as well as explicitly count duplicate entries in any particular column (or set of columns) in our data. We can perform these operations with the ``.duplicated()`` and ``.value_counts()`` functions:

## ``.duplicated()`` and ``.drop_duplicates()``

Syntax:
```python
DataFrame.duplicated(subset=None, keep='first')
```

where the ``subset`` parameter allows us to specify which columns Pandas should look at when searching for duplicated rows (if unspecified, Pandas will look for instances where an entire row is duplicated). The ``keep`` parameter allows us to specify which of the duplicated rows to keep (if any).

In [22]:
# Compares entire rows; no row repeats an earlier one, so every entry is False
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [23]:
# Only firmid and date are compared; with the default keep='first', the later repeat (row 5) is flagged
df.duplicated(subset=['firmid','date'])

0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

In [24]:
# keep='last': the last occurrence is treated as the original, so row 4 is flagged instead of row 5
df.duplicated(subset=['firmid','date'], keep='last')

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool

In [25]:
# keep=False: every row that shares its (firmid, date) pair with another row is flagged
df.duplicated(subset=['firmid','date'], keep=False)

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

To drop duplicated data, we can use the ``.duplicated()`` function inside a ``.loc[]``:

In [26]:
# Flag the repeated (firmid, date) rows, then keep everything that is not flagged
is_repeat = df.duplicated(subset=['firmid','date'])
df2 = df.loc[~is_repeat]
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2


or, more commonly, using the ``.drop_duplicates()`` function:

In [27]:
# keep='first' is the default, spelled out here: the first row of each
# (firmid, date) pair survives and later repeats are removed
df2 = df.drop_duplicates(subset=['firmid','date'], keep='first')
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2


Note that the above still keeps the row with index 4, and drops the row with index 5 (a duplicate of row 4 in terms of ``firmid`` and ``date``). This is because ``keep='first'`` by default for the ``.drop_duplicates()`` function. To eliminate both duplicated rows, we would have to set ``keep=False``:

In [28]:
# keep=False removes every row of a duplicated (firmid, date) pair, so rows 4 and 5 both disappear
df2 = df.drop_duplicates(subset=['firmid','date'], keep=False)
df2

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1


Note also that the meaning of "first" and "last" for the ``keep`` parameter depends on how your dataframe happens to be sorted at the time you drop the duplicates.

## ``.value_counts()``

This finds all the unique values in a column and counts the number of times they appear in that column.

Syntax:
```python
DataFrame.value_counts(subset=None, normalize=False, sort=True, ascending=False, dropna=True)
```

In [29]:
# Frequency of each distinct industry label; the missing entry is not counted
df['industry'].value_counts()

industry
Finance         2
Construction    2
Construct       1
Name: count, dtype: int64

In [30]:
# Same counts through the DataFrame method, with the column passed as ``subset``
df.value_counts('industry')

industry
Construction    2
Finance         2
Construct       1
Name: count, dtype: int64

# Operating on text data (strings)

Working with text data is a huge topic in data analysis. The Pandas user guide offers a detailed discussion on the way the Pandas package can be used to operate on text data: https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#. For the most part, all of this is done with the ``.str`` accessor and its methods. 

Here, we cover a very small subset of the functions that are commonly used for string manipulation inside a dataframe.

We'll work on the ``df`` dataframe:

In [31]:
# The original frame, as loaded from the Excel file
df

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2
5,2,12/31/2009,0.34,Construction,2


It is important to convert a text column to ``string`` type before we manipulate it with ``.str`` functions. For example, the ``industry`` column is currently of type ``object`` so we will convert it to ``string``:

In [32]:
# Switch industry from the generic object dtype to the dedicated string dtype
df['industry'] = df['industry'].astype('string')
df

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construct,2
5,2,12/31/2009,0.34,Construction,2


## Slicing into string data

In [33]:
# First three characters of each industry name; missing values stay <NA>
df['industry'].str[0:3]

0     Fin
1     Con
2    <NA>
3     Fin
4     Con
5     Con
Name: industry, dtype: string

## Converting to lower case or upper case

In [34]:
# Lower-case copy of the column (``df`` itself is not modified)
df['industry'].str.lower()

0         finance
1    construction
2            <NA>
3         finance
4       construct
5    construction
Name: industry, dtype: string

In [35]:
# Upper-case copy of the column (``df`` itself is not modified)
df['industry'].str.upper()

0         FINANCE
1    CONSTRUCTION
2            <NA>
3         FINANCE
4       CONSTRUCT
5    CONSTRUCTION
Name: industry, dtype: string

## Substrings

In [36]:
# Rows whose industry contains the substring "Cons".
# na=False counts a missing industry as "no match", so the mask holds only
# True/False and can be passed to .loc whatever the column's dtype is.
has_cons = df['industry'].str.contains("Cons", na=False)
df.loc[has_cons, :]

Unnamed: 0,firmid,date,return,industry,ind_code
1,2,,0.45,Construction,2
4,2,12/31/2009,,Construct,2
5,2,12/31/2009,0.34,Construction,2


In [37]:
# Caution: .str.replace substitutes the substring wherever it occurs, so values
# that already read "Construction" match too and become "Constructionion"
# (rows 1 and 5 in the output). Re-running this cell appends yet another "ion".
df['industry'] = df['industry'].str.replace("Construct","Construction")
df

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Constructionion,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construction,2
5,2,12/31/2009,0.34,Constructionion,2


In [41]:
# Overwrite every label that contains "Construct" (this also catches the
# mangled "Constructionion") with the canonical name.
# na=False counts a missing industry as "no match", so the mask holds only
# True/False and the assignment does not rely on how .loc treats <NA>.
needs_fix = df['industry'].str.contains("Construct", na=False)
df.loc[needs_fix, 'industry'] = "Construction"
df

Unnamed: 0,firmid,date,return,industry,ind_code
0,1,12/31/2008,0.05,Finance,1
1,2,,0.45,Construction,2
2,3,12/31/2008,23.0,,M
3,1,12/31/2009,0.87,Finance,1
4,2,12/31/2009,,Construction,2
5,2,12/31/2009,0.34,Construction,2


## Splitting

In [44]:
# Break each date string at the slashes; expand=True returns one column per piece,
# and the three pieces are stored as month, day and year
date_parts = df['date'].str.split(pat='/', expand=True)
df[['month','day','year']] = date_parts
df

Unnamed: 0,firmid,date,return,industry,ind_code,month,day,year
0,1,12/31/2008,0.05,Finance,1,12.0,31.0,2008.0
1,2,,0.45,Construction,2,,,
2,3,12/31/2008,23.0,,M,12.0,31.0,2008.0
3,1,12/31/2009,0.87,Finance,1,12.0,31.0,2009.0
4,2,12/31/2009,,Construction,2,12.0,31.0,2009.0
5,2,12/31/2009,0.34,Construction,2,12.0,31.0,2009.0


## Stripping white spaces

In [45]:
# Toy frame whose column labels carry leading and trailing spaces.
# A seeded generator makes the numbers identical on every Restart & Run All.
rng = np.random.default_rng(0)
newdf = pd.DataFrame(rng.random((3, 2)), columns=[' Column A ', " Column B "])
newdf

Unnamed: 0,Column A,Column B
0,0.877501,0.51143
1,0.250246,0.08214
2,0.541846,0.039013


In [47]:
# newdf['Column A']   # left commented out: raises KeyError, the actual label is ' Column A ' (with surrounding spaces)

In [49]:
# Trim the leading/trailing blanks from every column label
trimmed_labels = newdf.columns.str.strip()
newdf.columns = trimmed_labels
newdf

Unnamed: 0,Column A,Column B
0,0.877501,0.51143
1,0.250246,0.08214
2,0.541846,0.039013


In [50]:
# The lookup works now that the labels no longer carry surrounding spaces
newdf['Column A']

0    0.877501
1    0.250246
2    0.541846
Name: Column A, dtype: float64

## Chaining ``.str`` methods

In [52]:
# Normalise the labels one step at a time: trim, turn blanks into underscores, lower-case
labels = newdf.columns.str.strip()
labels = labels.str.replace(" ","_")
newdf.columns = labels.str.lower()
newdf

Unnamed: 0,column_a,column_b
0,0.877501,0.51143
1,0.250246,0.08214
2,0.541846,0.039013
