The complete explanation for this notebook is at https://youranalystbuddy.com/text-data-in-pandas/
    
## Data types for text

In [57]:
import pandas as pd

# Load the student roster from CSV and preview its first three rows.
students = pd.read_csv('students_standing.csv')
students.head(3)

Unnamed: 0,StudentID,FirstName,LastName,HSGPA,FYGPA,Standing
0,202005537,Eunice,Ehmann,2.47,2.42,average
1,202008560,Hobert,Schoenberger,2.27,2.05,average
2,202004948,Nicholas,Sizer,4.0,3.96,good


In [58]:
# Summarize the frame: index range, column names, non-null counts, dtypes and memory use.
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   StudentID  200 non-null    int64  
 1   FirstName  200 non-null    object 
 2   LastName   200 non-null    object 
 3   HSGPA      200 non-null    float64
 4   FYGPA      200 non-null    float64
 5   Standing   200 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 9.5+ KB


In [59]:
# Cast the ID, name and standing columns to pandas' dedicated 'string' dtype,
# then re-inspect the dtypes to confirm the conversion.
to_string_cols = ['StudentID', 'FirstName', 'LastName', 'Standing']
students = students.astype(dict.fromkeys(to_string_cols, 'string'))
students.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   StudentID  200 non-null    string 
 1   FirstName  200 non-null    string 
 2   LastName   200 non-null    string 
 3   HSGPA      200 non-null    float64
 4   FYGPA      200 non-null    float64
 5   Standing   200 non-null    string 
dtypes: float64(2), string(4)
memory usage: 9.5 KB


### Basic operations on text data

In [60]:
# Build "First Last" by joining the two name columns element-wise with a single space.
students['FullName'] = students['FirstName'].str.cat(students['LastName'], sep=' ')
students.head()

Unnamed: 0,StudentID,FirstName,LastName,HSGPA,FYGPA,Standing,FullName
0,202005537,Eunice,Ehmann,2.47,2.42,average,Eunice Ehmann
1,202008560,Hobert,Schoenberger,2.27,2.05,average,Hobert Schoenberger
2,202004948,Nicholas,Sizer,4.0,3.96,good,Nicholas Sizer
3,202001207,Elvin,Foulks,3.16,2.64,average,Elvin Foulks
4,202000260,Bruno,Viney,3.82,3.99,good,Bruno Viney


### The str property

In [49]:
# First four characters of every first name (element-wise string slice).
students['FirstName'].str.slice(stop=4)

0      Euni
1      Hobe
2      Nich
3      Elvi
4      Brun
       ... 
195    Judg
196    Wins
197    Thad
198    Wrig
199    Sidn
Name: FirstName, Length: 200, dtype: string

In [62]:
# Email = first initial + up to five leading characters of the last name
#         + last five characters of the student ID + a fixed domain.
students['Email'] = (
    students['FirstName'].str.get(0)
    + students['LastName'].str.slice(stop=5)
    + students['StudentID'].str.slice(start=-5)
    + '@work.com'
)
students.head(n=3)

Unnamed: 0,StudentID,FirstName,LastName,HSGPA,FYGPA,Standing,FullName,Email
0,202005537,Eunice,Ehmann,2.47,2.42,average,Eunice Ehmann,EEhman05537@work.com
1,202008560,Hobert,Schoenberger,2.27,2.05,average,Hobert Schoenberger,HSchoe08560@work.com
2,202004948,Nicholas,Sizer,4.0,3.96,good,Nicholas Sizer,NSizer04948@work.com


In [63]:
# Upper-case every first name; this displays a new Series and leaves `students` unchanged.
students['FirstName'].str.upper()

0        EUNICE
1        HOBERT
2      NICHOLAS
3         ELVIN
4         BRUNO
         ...   
195       JUDGE
196     WINSTON
197    THADDEUS
198      WRIGHT
199      SIDNEY
Name: FirstName, Length: 200, dtype: string

In [65]:
# Lower-case every email address; the result is only displayed, not stored back.
students['Email'].str.lower()

0      eehman05537@work.com
1      hschoe08560@work.com
2      nsizer04948@work.com
3      efoulk01207@work.com
4      bviney00260@work.com
               ...         
195    jgrand00691@work.com
196    wkanek09695@work.com
197     tchen08725@work.com
198    wmarbu01120@work.com
199    ssienk09418@work.com
Name: Email, Length: 200, dtype: string

In [66]:
# Drop the leading '2020' from each student ID string.
students['StudentID'].str.removeprefix(prefix='2020')

0      05537
1      08560
2      04948
3      01207
4      00260
       ...  
195    00691
196    09695
197    08725
198    01120
199    09418
Name: StudentID, Length: 200, dtype: string

In [67]:
# Drop the trailing '.com' from each email address.
students['Email'].str.removesuffix(suffix='.com')

0      EEhman05537@work
1      HSchoe08560@work
2      NSizer04948@work
3      EFoulk01207@work
4      BViney00260@work
             ...       
195    JGrand00691@work
196    WKanek09695@work
197     TChen08725@work
198    WMarbu01120@work
199    SSienk09418@work
Name: Email, Length: 200, dtype: string

In [68]:
# Character count of each full name; the space between first and last name is counted.
students['FullName'].str.len()

0      13
1      19
2      14
3      12
4      11
       ..
195    17
196    14
197    13
198    16
199    18
Name: FullName, Length: 200, dtype: Int64

In [69]:
# Boolean Series: True where the full name contains 'er' (case-sensitive).
students['FullName'].str.contains(pat='er')

0      False
1       True
2       True
3      False
4      False
       ...  
195    False
196    False
197    False
198     True
199    False
Name: FullName, Length: 200, dtype: boolean

### Split strings into columns

In [71]:
# Split each full name on single spaces; expand=True returns the pieces as
# separate DataFrame columns instead of a Series of lists.
students['FullName'].str.split(pat=' ', expand=True)

Unnamed: 0,0,1
0,Eunice,Ehmann
1,Hobert,Schoenberger
2,Nicholas,Sizer
3,Elvin,Foulks
4,Bruno,Viney
...,...,...
195,Judge,Grandinetti
196,Winston,Kaneko
197,Thaddeus,Chen
198,Wright,Marburger


### Simple search in text columns

In [70]:
# Keep only the rows whose full name contains 'er': build the boolean mask first,
# then use it as the row selector.
name_has_er = students['FullName'].str.contains('er')
students.loc[name_has_er, :]

Unnamed: 0,StudentID,FirstName,LastName,HSGPA,FYGPA,Standing,FullName,Email
1,202008560,Hobert,Schoenberger,2.27,2.05,average,Hobert Schoenberger,HSchoe08560@work.com
2,202004948,Nicholas,Sizer,4.0,3.96,good,Nicholas Sizer,NSizer04948@work.com
7,202005622,Leroy,Pasha,2.22,2.2,average,Leroy Pasha,LPasha05622@work.com
13,202005224,Ferdinand,Geil,2.67,2.78,average,Ferdinand Geil,FGeil05224@work.com
23,202005164,Vernie,Teneyck,2.91,3.41,good,Vernie Teneyck,VTeney05164@work.com
29,202001489,Everette,Ra,2.81,2.02,average,Everette Ra,ERa01489@work.com
33,202006689,Wyatt,Vandervoort,2.56,1.69,poor,Wyatt Vandervoort,WVande06689@work.com
42,202005840,Spencer,Fielding,2.4,2.08,average,Spencer Fielding,SField05840@work.com
49,202001447,Perry,Jahn,3.31,3.39,good,Perry Jahn,PJahn01447@work.com
55,202008194,Elton,Bjerke,2.52,2.34,average,Elton Bjerke,EBjerk08194@work.com


In [72]:
# Replace every literal occurrence of 'er' with 'or' in each full name.
# regex=False is passed explicitly: the default of `regex` changed from True to
# False in pandas 2.0, so stating it keeps this a plain substring replacement on
# any pandas version. The result is only displayed; `students` is not modified.
students['FullName'].str.replace('er', 'or', regex=False)

0            Eunice Ehmann
1      Hobort Schoenborgor
2           Nicholas Sizor
3             Elvin Foulks
4              Bruno Viney
              ...         
195      Judge Grandinetti
196         Winston Kaneko
197          Thaddeus Chen
198       Wright Marburgor
199     Sidney Sienkiewicz
Name: FullName, Length: 200, dtype: string