# Pandas - Text Methods for String Data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
email = "kshitizit22@gmail.com"

In [3]:
# Plain-Python baseline: split the address into the parts before and after '@'.
email.split('@')

['kshitizit22', 'gmail.com']

In [4]:
# Five sample strings; the final entry is the digit string '5'.
names = pd.Series(
    data=['Andrew', 'Bobo', 'Claire', 'David', '5'],
)

In [5]:
names

0    Andrew
1      Bobo
2    Claire
3     David
4         5
dtype: object

In [6]:
# The .str accessor applies string methods element-wise; upper() upper-cases every entry.
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [7]:
# Element-wise capitalize(); these sample names are already capitalized, so the output matches the input.
names.str.capitalize()

0    Andrew
1      Bobo
2    Claire
3     David
4         5
dtype: object

In [8]:
# Boolean mask: True only where the entry consists of digits (just '5' here).
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [9]:
# Boolean mask: True where the entry is purely alphabetic (every entry except '5').
names.str.isalpha()

0     True
1     True
2     True
3     True
4    False
dtype: bool

In [10]:
# Two strings, each bundling several comma-separated tickers.
tech_finance = [
    'GOOG,APPL,AMZN',
    'JPM,BAC,GS',
]

In [11]:
# Only two elements: each string still bundles several tickers together.
len(tech_finance)

2

In [12]:
# Wrap the list in a Series so the .str accessor can be used on it.
tickers = pd.Series(tech_finance)

In [13]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [14]:
# Split each string on commas; every element becomes a list of tickers.
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [15]:
# .str[0] indexes into each list, picking the first ticker of every row.
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [16]:
# expand=True spreads the split pieces across columns, giving a table instead of a Series of lists.
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [17]:
# Deliberately untidy input: stray whitespace and a semicolon inside a name.
messy_names = pd.Series(
    data=["andrew  ", "bo;bo", "  claire  "],
)

In [18]:
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [19]:
# Clean-up pipeline built on the vectorised .str accessor:
#   1. drop semicolons -- regex=False treats ";" as a literal on every pandas
#      version (the default for `regex` differs between pandas releases),
#   2. trim leading/trailing whitespace,
#   3. capitalize each name.
(
    messy_names
    .str.replace(";", "", regex=False)
    .str.strip()
    .str.capitalize()
)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [20]:
def custom_quality(name):
    """Tidy a single name string.

    Removes every ';', strips leading and trailing whitespace, and
    capitalizes the result.

    Parameters
    ----------
    name : str
        Raw name to clean.

    Returns
    -------
    str
        The cleaned name.
    """
    return name.replace(';', '').strip().capitalize()

In [21]:
# apply() calls custom_quality once per element; the result matches the chained .str version.
messy_names.apply(custom_quality)

0    Andrew
1      Bobo
2    Claire
dtype: object

In [22]:
# np.vectorize wraps the scalar function so it can take the whole Series at once;
# note that the result is a NumPy array, not a Series.
np.vectorize(custom_quality)(messy_names)

array(['Andrew', 'Bobo', 'Claire'], dtype='<U6')

<br><br>
## <b>Pandas Time Methods for Date & Time Data</b>

In [23]:
# Components of the example timestamp: 2017-01-02 13:30:15.
my_year, my_month, my_day = 2017, 1, 2
my_hour, my_minute, my_second = 13, 30, 15

In [24]:
# Date only: no time components are passed to the constructor.
my_date = datetime(year=my_year, month=my_month, day=my_day)

In [25]:
my_date

datetime.datetime(2017, 1, 2, 0, 0)

In [26]:
# Full timestamp, passing every component by keyword.
my_date_time = datetime(
    year=my_year,
    month=my_month,
    day=my_day,
    hour=my_hour,
    minute=my_minute,
    second=my_second,
)

In [27]:
my_date_time

datetime.datetime(2017, 1, 2, 13, 30, 15)

In [28]:
# Individual components of a datetime are available as attributes.
my_date.day

2

In [29]:
# The minute component of the full timestamp.
my_date_time.minute

30

##### Using Pandas

In [30]:
# Two differently formatted date strings plus a missing value.
myser = pd.Series(
    data=['November 3, 2000', '2000-01-01', None],
)

In [31]:
myser

0    November 3, 2000
1          2000-01-01
2                None
dtype: object

In [32]:
# Parses both string formats and turns the missing value into NaT.
# NOTE(review): newer pandas releases (2.x) infer a single format from the first element, so
# mixed-format input like this may need format='mixed' -- confirm against the installed version.
pd.to_datetime(myser)

0   2000-11-03
1   2000-01-01
2          NaT
dtype: datetime64[ns]

In [33]:
# Each element of the converted Series is a pandas Timestamp.
pd.to_datetime(myser)[0]

Timestamp('2000-11-03 00:00:00')

In [34]:
obvi_euro_date = '31-12-2000'

In [35]:
# 31 cannot be a month, so the string is read day-first without any hint.
pd.to_datetime(obvi_euro_date) 

Timestamp('2000-12-31 00:00:00')

In [36]:
euro_date = '18-12-2000'

In [37]:
# Default parsing: with day 18 there is only one valid reading (18 December).
pd.to_datetime(euro_date) 

Timestamp('2000-12-18 00:00:00')

In [38]:
# dayfirst=True states the DD-MM-YYYY order explicitly. For this value the result equals the
# default parse because 18 is not a valid month; the flag only changes the outcome for
# ambiguous strings such as '10-12-2000'.
pd.to_datetime(euro_date,dayfirst=True) 

Timestamp('2000-12-18 00:00:00')

In [39]:
style_date = '12--Dec--2000'

In [40]:
# Unusual separators need an explicit strftime-style format:
# %d = day of month, %b = abbreviated month name, %Y = four-digit year.
pd.to_datetime(style_date, format='%d--%b--%Y')

Timestamp('2000-12-12 00:00:00')

In [41]:
strange_date = '12th of Dec 2000'

In [42]:
# No format given: the parser still recognises this free-form string.
pd.to_datetime(strange_date)

Timestamp('2000-12-12 00:00:00')

In [43]:
# Load the retail sales CSV (path is relative to the working directory).
# Without parse_dates, the DATE column comes in as plain strings.
sales = pd.read_csv('RetailSales_BeerWineLiquor.csv')

In [44]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [45]:
sales.iloc[0]['DATE']

'1992-01-01'

In [46]:
# Confirms that the DATE values were loaded as plain Python strings.
type(sales.iloc[0]['DATE'])

str

In [47]:
sales['DATE'] = pd.to_datetime(sales['DATE'])  # Converts the string column to datetime64 (elements become Timestamps)

In [48]:
# Check the dtypes: DATE should now be datetime64[ns] rather than object.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           340 non-null    datetime64[ns]
 1   MRTSSM4453USN  340 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 5.4 KB


In [49]:
# The same cell now holds a Timestamp rather than a str.
sales.iloc[0]['DATE']

Timestamp('1992-01-01 00:00:00')

In [50]:
# Alternative: parse column 0 (DATE) as dates while reading the file,
# which replaces the separate pd.to_datetime step.
sales = pd.read_csv('RetailSales_BeerWineLiquor.csv',parse_dates=[0])

In [51]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [52]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   DATE           340 non-null    datetime64[ns]
 1   MRTSSM4453USN  340 non-null    int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 5.4 KB


##### Resampling

In [53]:
# Still the default RangeIndex; the resampling cells that follow first move DATE into the index.
sales.index

RangeIndex(start=0, stop=340, step=1)

In [54]:
# Move DATE into the index so that resample() can group rows by time.
# Note: re-running this cell on its own fails, because DATE is no longer a column afterwards.
sales = sales.set_index("DATE")

In [55]:
# resample() by itself only returns a Resampler object; an aggregation must be called to get values.
sales.resample(rule='A')

<pandas.core.resample.DatetimeIndexResampler object at 0x00000210442463D0>

In [56]:
# Mean per calendar year; rule='A' (year end) labels each bucket with 31 December.
sales.resample(rule='A').mean()

Unnamed: 0_level_0,MRTSSM4453USN
DATE,Unnamed: 1_level_1
1992-12-31,1807.25
1993-12-31,1794.833333
1994-12-31,1841.75
1995-12-31,1833.916667
1996-12-31,1929.75
1997-12-31,2006.75
1998-12-31,2115.166667
1999-12-31,2206.333333
2000-12-31,2375.583333
2001-12-31,2468.416667


When calling `.resample()` you first need to pass in a **rule** parameter, then you need to call some sort of aggregation function.

The **rule** parameter describes the frequency with which to apply the aggregation function (daily, monthly, yearly, etc.)<br>
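It is passed in using an "offset alias" - refer to the table below. [[reference](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)]<br>
**Note:** pandas 2.2 and later deprecate several of these spellings in favour of new ones (e.g. `M` → `ME`, `Q` → `QE`, `A` → `YE`, `H` → `h`, `T` → `min`, `S` → `s`); the table lists the aliases as used in this notebook.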

The aggregation function is needed because, due to resampling, we need some sort of mathematical rule to join the rows (mean, sum, count, etc.)

<table style="display: inline-block">
    <caption style="text-align: center"><strong>TIME SERIES OFFSET ALIASES</strong></caption>
<tr><th>ALIAS</th><th>DESCRIPTION</th></tr>
<tr><td>B</td><td>business day frequency</td></tr>
<tr><td>C</td><td>custom business day frequency (experimental)</td></tr>
<tr><td>D</td><td>calendar day frequency</td></tr>
<tr><td>W</td><td>weekly frequency</td></tr>
<tr><td>M</td><td>month end frequency</td></tr>
<tr><td>SM</td><td>semi-month end frequency (15th and end of month)</td></tr>
<tr><td>BM</td><td>business month end frequency</td></tr>
<tr><td>CBM</td><td>custom business month end frequency</td></tr>
<tr><td>MS</td><td>month start frequency</td></tr>
<tr><td>SMS</td><td>semi-month start frequency (1st and 15th)</td></tr>
<tr><td>BMS</td><td>business month start frequency</td></tr>
<tr><td>CBMS</td><td>custom business month start frequency</td></tr>
<tr><td>Q</td><td>quarter end frequency</td></tr>
<tr><td></td><td><font color=white>intentionally left blank</font></td></tr></table>

<table style="display: inline-block; margin-left: 40px">
<caption style="text-align: center"></caption>
<tr><th>ALIAS</th><th>DESCRIPTION</th></tr>
<tr><td>BQ</td><td>business quarter end frequency</td></tr>
<tr><td>QS</td><td>quarter start frequency</td></tr>
<tr><td>BQS</td><td>business quarter start frequency</td></tr>
<tr><td>A</td><td>year end frequency</td></tr>
<tr><td>BA</td><td>business year end frequency</td></tr>
<tr><td>AS</td><td>year start frequency</td></tr>
<tr><td>BAS</td><td>business year start frequency</td></tr>
<tr><td>BH</td><td>business hour frequency</td></tr>
<tr><td>H</td><td>hourly frequency</td></tr>
<tr><td>T, min</td><td>minutely frequency</td></tr>
<tr><td>S</td><td>secondly frequency</td></tr>
<tr><td>L, ms</td><td>milliseconds</td></tr>
<tr><td>U, us</td><td>microseconds</td></tr>
<tr><td>N</td><td>nanoseconds</td></tr></table>

In [57]:
# Turn DATE back into an ordinary column so the .dt accessor can be used on it.
sales = sales.reset_index()

In [58]:
sales

Unnamed: 0,DATE,MRTSSM4453USN
0,1992-01-01,1509
1,1992-02-01,1541
2,1992-03-01,1597
3,1992-04-01,1675
4,1992-05-01,1822
...,...,...
335,2019-12-01,6630
336,2020-01-01,4388
337,2020-02-01,4533
338,2020-03-01,5562


In [59]:
# .dt exposes datetime properties element-wise; is_leap_year flags dates that fall in a leap year.
sales['DATE'].dt.is_leap_year

0       True
1       True
2       True
3       True
4       True
       ...  
335    False
336     True
337     True
338     True
339     True
Name: DATE, Length: 340, dtype: bool

In [60]:
# Month number (1-12) of each date.
sales['DATE'].dt.month

0       1
1       2
2       3
3       4
4       5
       ..
335    12
336     1
337     2
338     3
339     4
Name: DATE, Length: 340, dtype: int64