# Drop multiple columns

In [1]:
# conventional way to import pandas
import pandas as pd

In [2]:
# read a dataset of Chipotle orders directly from a URL and store the results in a DataFrame
df = pd.read_table('http://bit.ly/chiporders')
df

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",$10.98
6,3,1,Side of Chips,,$1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",$11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",$9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",$9.25


In [7]:
def drop_multiple_col(col_names_list, df): 
    df.drop(col_names_list, axis=1, inplace=True)
    return df

In [14]:
col_names = ['choice_description']
drop_multiple_col(col_names,df)
df

Unnamed: 0,order_id,item_name,item_price
0,1,Chips and Fresh Tomato Salsa,$2.39
1,1,Izze,$3.39
2,1,Nantucket Nectar,$3.39
3,1,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2,Chicken Bowl,$16.98
5,3,Chicken Bowl,$10.98
6,3,Side of Chips,$1.69
7,4,Steak Burrito,$11.75
8,4,Steak Soft Tacos,$9.25
9,5,Steak Burrito,$9.25


# Change dtypes

In [17]:
def change_dtypes(col_int, col_float, df): 
    df[col_int] = df[col_int].astype('int32')
    df[col_float] = df[col_float].astype('float32')
    return df

In [19]:
df.dtypes
change_dtypes("order_id","order_id",df)
df

Unnamed: 0,order_id,item_name,item_price
0,1.0,Chips and Fresh Tomato Salsa,$2.39
1,1.0,Izze,$3.39
2,1.0,Nantucket Nectar,$3.39
3,1.0,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2.0,Chicken Bowl,$16.98
5,3.0,Chicken Bowl,$10.98
6,3.0,Side of Chips,$1.69
7,4.0,Steak Burrito,$11.75
8,4.0,Steak Soft Tacos,$9.25
9,5.0,Steak Burrito,$9.25


# Convert categorical variable to numerical variable

In [21]:
def convert_cat2num(df):
    # Convert categorical variable to numerical variable
    num_encode = {'item_name' : {'Chicken Salad Bowl':5, 'Steak Burrito':4, 'Chips':3, 'Izze':2, 'Side of Chips':1 }}  
    df.replace(num_encode, inplace=True)
    return df

In [22]:
convert_cat2num(df)

Unnamed: 0,order_id,item_name,item_price
0,1.0,Chips and Fresh Tomato Salsa,$2.39
1,1.0,2,$3.39
2,1.0,Nantucket Nectar,$3.39
3,1.0,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2.0,Chicken Bowl,$16.98
5,3.0,Chicken Bowl,$10.98
6,3.0,1,$1.69
7,4.0,4,$11.75
8,4.0,Steak Soft Tacos,$9.25
9,5.0,4,$9.25


#  Check missing data

In [23]:
def check_missing_data(df):
    # check for any missing data in the df (display in descending order)
    return df.isnull().sum().sort_values(ascending=False)

In [24]:
check_missing_data(df)

item_price    0
item_name     0
order_id      0
dtype: int64

# Remove strings in columns

In [29]:
def remove_col_str(df):
    # remove a portion of string in a dataframe column - col_1
    df['item_name'].replace('\n', '', regex=True, inplace=True)
    
    # remove all the characters after &# (including &#) for column - col_1
    df['item_name'].replace(' &#.*', '', regex=True, inplace=True)
    return df

In [30]:
remove_col_str(df)

Unnamed: 0,order_id,item_name,item_price
0,1.0,Chips and Fresh Tomato Salsa,$2.39
1,1.0,2,$3.39
2,1.0,Nantucket Nectar,$3.39
3,1.0,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2.0,Chicken Bowl,$16.98
5,3.0,Chicken Bowl,$10.98
6,3.0,1,$1.69
7,4.0,4,$11.75
8,4.0,Steak Soft Tacos,$9.25
9,5.0,4,$9.25


# Remove white space in columns

In [31]:

def remove_col_white_space(df,col):
    # remove white space at the beginning of string 
    df[col] = df[col].str.lstrip()
    return df

In [32]:
remove_col_white_space(df,'item_name')

Unnamed: 0,order_id,item_name,item_price
0,1.0,Chips and Fresh Tomato Salsa,$2.39
1,1.0,,$3.39
2,1.0,Nantucket Nectar,$3.39
3,1.0,Chips and Tomatillo-Green Chili Salsa,$2.39
4,2.0,Chicken Bowl,$16.98
5,3.0,Chicken Bowl,$10.98
6,3.0,,$1.69
7,4.0,,$11.75
8,4.0,Steak Soft Tacos,$9.25
9,5.0,,$9.25


# Concatenate two columns with strings (with condition)

In [39]:
def concat_col_str_condition(df):
    # concat 2 columns with strings if the last 3 letters of the first column are 'pil'
    mask = df['item_name'].str.endswith('ink', na=False)
    col_new = df[mask]['item_name'] + df[mask]['item_price']
    col_new.replace('ink', ' ', regex=True, inplace=True)  # replace the 'pil' with emtpy space
    return col_new

In [40]:
concat_col_str_condition(df)

263     Canned Soft Dr $1.25 
292     Canned Soft Dr $1.25 
298     6 Pack Soft Dr $6.49 
320     Canned Soft Dr $1.25 
337     Canned Soft Dr $1.25 
341     6 Pack Soft Dr $6.49 
346     Canned Soft Dr $1.25 
347     Canned Soft Dr $1.25 
357     6 Pack Soft Dr $6.49 
368     Canned Soft Dr $1.25 
380     Canned Soft Dr $1.25 
381     Canned Soft Dr $1.25 
388     6 Pack Soft Dr $6.49 
393     Canned Soft Dr $1.25 
401     Canned Soft Dr $1.25 
403     Canned Soft Dr $1.25 
410     Canned Soft Dr $1.25 
417     6 Pack Soft Dr $6.49 
459     Canned Soft Dr $1.25 
492     Canned Soft Dr $2.50 
505     Canned Soft Dr $1.25 
507     Canned Soft Dr $1.25 
513     Canned Soft Dr $1.25 
541     Canned Soft Dr $1.25 
553     6 Pack Soft Dr $6.49 
566     Canned Soft Dr $1.25 
567     Canned Soft Dr $1.25 
610     Canned Soft Dr $1.25 
618     Canned Soft Dr $1.25 
635     Canned Soft Dr $1.25 
                ...          
4422    Canned Soft Dr $1.25 
4430    Canned Soft Dr $1.25 
4462    Ca

# Convert timestamp(from string to datetime format)

In [None]:
def convert_str_datetime(df): 
    df.insert(loc=2, column='timestamp', value=pd.to_datetime(df.transdate, format='%Y-%m-%d %H:%M:%S.%f')) 
    return df