In [6]:
class EDA:
    def __init__(self, filename):
        """
        constructor
        params: 
        filename : this is the .csv file we want to work with
        """
        self.df = pd.read_csv(f"{filename}", low_memory=False)
        self.missing = None
    
    def value_counts(self, attr):
        """
        returns the count of various entries inside the colname
        params: 
        attr : column for which you want to run the pandas .value_counts()
        """
        return self.df[f"{attr}"].value_counts()
    
    def rem_percent_missing(self, percent):
        """
        remove the specified percentage of null values from the dataframe
        params:
        percent: int or float 
        """
        col_list = self.df.columns[((self.df.isnull().sum()/len(self.df))*100 < percent)].to_list()
        self.df = self.df[col_list]
        return self.df
        
    def percent_missing(self):
        """
        report the percent missing from all the attributes
        """
        self.missing = ((self.df.isnull().sum()/len(self.df))*100)[((self.df.isnull().sum()/len(self.df))*100) > 0]
        return self.missing
    
    def check_missing_vals(self, attr_name):
        """
        find the missing value percent for an attribute
        """
        missing_attrs = self.percent_missing()
        if attr_name in missing_attrs:
            return f"{missing_attrs[attr_name]}"
        else:
            return "Missing values NOT found"
    
    def update_df(self, new_df):
        """
        Could be used if we have made a lot of changes in the 
        """
        self.df = new_df
        
    # attribute visualization and analysis below: 
    
    def addr_state(self):
        print(self.check_missing_vals('addr_state'))
        print(self.df['addr_state'].value_counts())
    
 

    def annual_inc(self):
        """
        Report on the 'annual_inc' column.

        Prints the missing-value report, the number and the percentage of
        rows whose income is above 250000, and draws the income distribution
        coloured by 'loan_status'.
        """
        income_cap = 250000
        print("Missing vals: ", self.check_missing_vals('annual_inc'), "\n")
        total_rows = len(self.df)
        # Count the rows above the cap directly. A comparison against NaN is
        # False, so rows with a missing income are not reported as outliers.
        rows_above_cap = int((self.df['annual_inc'] > income_cap).sum())
        print("We can drop off the rows with annual_inc above 250000 as these are mostly outliers. Number of rows above 250000", rows_above_cap, "\n")
        # Derive the percentage from the count just computed rather than from
        # a fixed number, and avoid dividing by zero on an empty DataFrame.
        percent_above_cap = rows_above_cap * 100 / total_rows if total_rows else 0.0
        print("Percentage of Customers with annual income more than 250000: ", percent_above_cap)
        sns.displot(data=self.df, x='annual_inc', hue='loan_status', bins=80, height=5, kde=True)
    
    
    def earliest_cr_line(self):
        print(self.check_missing_vals('earliest_cr_line'), "\n")
        print("Dropping the month part and only keeping the year \n")
        self.df['earliest_cr_line'] = self.df['earliest_cr_line'].apply(lambda date: int(date[-4:]))
        print(self.df.groupby('loan_status')['earliest_cr_line'].describe())
    
    def emp_length(self):
        """
        Explore 'emp_length': print its missing-value report and distinct
        values, draw a count plot split by 'loan_status', and plot the
        percentage of "Charged Off" loans per employment length.

        Both plots use the same employment-length ordering.
        """
        print(self.check_missing_vals('emp_length'), "\n")
        print(self.df['emp_length'].unique(), "\n")
        emp_length_order = [
            '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
            '6 years', '7 years', '8 years', '9 years', '10+ years',
        ]
        plt.figure(figsize=(16, 8))
        sns.countplot(x='emp_length', data=self.df, order=emp_length_order, hue='loan_status')
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("emp_length")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("emp_length")['loan_status'].count()
        percent_chargedoff = (charged_off * 100) / (charged_off + fully_paid)
        # groupby sorts the labels alphabetically ('1 year', '10+ years', ...);
        # reindex so the bars follow the same order as the count plot above.
        percent_chargedoff = percent_chargedoff.reindex(emp_length_order)
        plt.figure(figsize=(12, 4), dpi=130)
        percent_chargedoff.plot(kind='bar')
        plt.title("percent charged off")

    def emp_title(self):
        print("Percentage of missing values: ", self.check_missing_vals('emp_title'),"\n")
        print(self.df['emp_title'].describe())
    
    def grade_subgrade(self):
        """
        Print the missing-value reports for 'sub_grade' and 'grade' and draw
        a count plot of 'sub_grade' split by 'loan_status', with the
        sub-grades in sorted order.
        """
        print("sub_grade missing values: ", self.check_missing_vals('sub_grade'))
        print("Grade missing values: ", self.check_missing_vals('grade'))
        plt.figure(figsize=(16, 4))
        # Drop nulls before sorting: NaN cannot be ordered against strings
        # and would make sorted() raise a TypeError.
        subgrade_order = sorted(self.df['sub_grade'].dropna().unique())
        sns.countplot(x='sub_grade', data=self.df, order=subgrade_order, hue='loan_status')

    
    def fico_high_low(self):
        """
        Summarise the two FICO range columns, then replace them with a
        single 'fico' column holding their average.

        Prints missing-value reports and describe() output for
        'fico_range_high' and 'fico_range_low', adds 'fico', drops the two
        source columns from self.df, and shows a box plot plus grouped
        statistics of 'fico' by 'loan_status'.
        """
        high_missing = self.check_missing_vals('fico_range_high')
        print("fico_range_high missing values: ", high_missing, "\n")
        low_missing = self.check_missing_vals('fico_range_low')
        print("fico_range_low missing values: ", low_missing, "\n")
        for bound in ('fico_range_high', 'fico_range_low'):
            print(self.df[bound].describe(), "\n")
        range_total = self.df['fico_range_high'] + self.df['fico_range_low']
        self.df['fico'] = range_total / 2
        # The source columns are removed, so a second call would not find them.
        self.df = self.df.drop(columns=['fico_range_high', 'fico_range_low'])
        plt.figure(figsize=(10, 5))
        fico_axes = sns.boxplot(data=self.df, y='loan_status', x='fico')
        print(fico_axes)
        print(self.df.groupby('loan_status')['fico'].describe())
        
    def home_ownership(self):
        """
        Explore 'home_ownership'.

        Prints the missing-value report and value counts, folds the 'NONE'
        and 'ANY' categories into 'OTHER' (modifying self.df), draws a count
        plot split by 'loan_status', and plots the percentage of
        "Charged Off" and of "Fully Paid" loans per category.
        """
        print("Missing vals: ", self.check_missing_vals('home_ownership'))
        print("Value Counts: ", self.df['home_ownership'].value_counts(), "\n")
        # The message now names the categories the replace() below really merges.
        print("\n merging ('NONE', 'ANY') into 'OTHER'\n")
        self.df['home_ownership'] = self.df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

        sns.countplot(x='home_ownership', data=self.df, hue='loan_status')
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("home_ownership")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("home_ownership")['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_fully_paid = (fully_paid * 100) / total
        bar_plots = (
            (percentage_charged_off, "Percentage charged off vs home_ownership category"),
            (percentage_fully_paid, "Percentage fully paid vs home_ownership category"),
        )
        for percentages, heading in bar_plots:
            plt.figure(figsize=(12, 4))
            percentages.plot(kind='bar')
            plt.title(heading)
            plt.show()
    
    def application_type(self):
        """
        Explore 'application_type': print its missing-value report and value
        counts, then plot the percentage of "Charged Off" and of "Fully Paid"
        loans per application type.
        """
        print("missing vals: ", self.check_missing_vals('application_type'), "\n")
        # Print the counts; computing them without printing shows nothing.
        print(self.df['application_type'].value_counts())
        # percentage fully paid and charged off
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("application_type")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("application_type")['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_fully_paid = (fully_paid * 100) / total
        percentage_charged_off.plot(kind='bar')
        plt.title("Percentage charged off vs application_type")
        plt.show()
        percentage_fully_paid.plot(kind='bar')
        # Title names the column actually plotted.
        plt.title("Percentage fully paid vs application_type")
        plt.show()
            
    def initial_list(self):
        """
        Explore 'initial_list_status': print its missing-value report and
        value counts, then plot the percentage of "Charged Off" and of
        "Fully Paid" loans per status.
        """
        # Print both results, as the other attribute reports do; computing
        # them without printing shows nothing.
        print("Missing vals: ", self.check_missing_vals('initial_list_status'), "\n")
        print(self.df['initial_list_status'].value_counts())
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("initial_list_status")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("initial_list_status")['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_fully_paid = (fully_paid * 100) / total
        percentage_charged_off.plot(kind='bar')
        plt.title("Percentage charged off vs initial_list_status")
        plt.show()
        percentage_fully_paid.plot(kind='bar')
        plt.title("Percentage fully paid vs initial_list_status")
        plt.show()

        
    def int_rate(self):
        """
        Print the missing-value report and per-'loan_status' statistics for
        'int_rate', then draw a box plot of 'int_rate' by 'loan_status'.
        """
        missing_report = self.check_missing_vals('int_rate')
        print("Missing vals: ", missing_report, "\n")
        by_status = self.df.groupby('loan_status')['int_rate']
        print(by_status.describe(), "\n")
        sns.boxplot(data=self.df, y='loan_status', x='int_rate')
        
    def loan_amnt(self):
        """
        Print the missing-value report and per-'loan_status' statistics for
        'loan_amnt', then draw a box plot of 'loan_amnt' by 'loan_status'.
        """
        print("Missing vals: ", self.check_missing_vals('loan_amnt'))
        # Print the grouped statistics; computing them without printing
        # shows nothing.
        print(self.df.groupby('loan_status')['loan_amnt'].describe())
        sns.boxplot(data=self.df, y='loan_status', x='loan_amnt')

    def bnkcrd_acts(self):
        """
        Explore and clean 'num_actv_bc_tl'.

        Prints the missing-value report and describe() output, fills nulls
        with 4, prints grouped statistics, keeps only rows with a value
        below 10 (modifying self.df), prints the grouped statistics again
        and draws a box plot by 'loan_status'.
        """
        column = 'num_actv_bc_tl'
        print("Missing vals: ", self.check_missing_vals(column), "\n")
        print(self.df[column].describe(), "\n")
        # NOTE(review): the fill value 4 is a fixed constant; its origin is
        # not visible here.
        self.df[column] = self.df[column].fillna(4)
        print(self.df.groupby('loan_status')[column].describe(), "\n")
        below_ten = self.df[column] < 10
        self.df = self.df[below_ten]
        print(self.df.groupby('loan_status')[column].describe(), "\n")
        box_axes = sns.boxplot(data=self.df, y='loan_status', x=column)
        print(box_axes, "\n")
    
    def mort_acc(self):
        """
        Explore and clean 'mort_acc'.

        Prints the missing-value report and describe() output, fills nulls
        with the column mean, prints grouped statistics, keeps only rows
        with a value below 8 (modifying self.df) and draws a box plot by
        'loan_status'.
        """
        column = 'mort_acc'
        print("Missing vals: ", self.check_missing_vals(column), "\n")
        print(self.df[column].describe(), "\n")
        print("Replacing with the mean value")
        column_mean = self.df[column].mean()
        self.df[column] = self.df[column].fillna(column_mean)
        print(self.df.groupby('loan_status')[column].describe(), "\n")
        below_eight = self.df[column] < 8
        self.df = self.df[below_eight]
        box_axes = sns.boxplot(data=self.df, y='loan_status', x=column)
        print(box_axes)

    def total_acc(self):
        """
        Explore and trim 'total_acc'.

        Prints the missing-value report, draws a count plot of the column,
        keeps only rows with a value below 60 (modifying self.df), prints
        grouped statistics and draws a second count plot split by
        'loan_status'.
        """
        wide_figure = (24, 4)
        print("Missing vals: ", self.check_missing_vals('total_acc'), "\n")
        plt.figure(figsize=wide_figure)
        plt.xticks(rotation=90)
        sns.countplot(data=self.df, x='total_acc')
        plt.show()
        below_sixty = self.df['total_acc'] < 60
        self.df = self.df[below_sixty]
        print(self.df.groupby('loan_status')['total_acc'].describe())
        plt.figure(figsize=wide_figure)
        plt.xticks(rotation=90)
        sns.countplot(data=self.df, x='total_acc', hue='loan_status')
        
    def tot_cur_bal(self):
        """
        Explore and clean 'tot_cur_bal'.

        Prints the missing-value report and describe() output, fills nulls
        with the mean reported by describe(), draws a histogram, keeps only
        rows with a value below 1000000 (modifying self.df) and prints
        grouped statistics by 'loan_status'.
        """
        print("Missing vals: ", self.check_missing_vals('tot_cur_bal'), "\n")
        summary = self.df['tot_cur_bal'].describe()
        print(summary, "\n")
        print("Filling the NULL values ", "\n")
        # The frame is unchanged since describe() ran, so its mean is reused.
        self.df['tot_cur_bal'] = self.df['tot_cur_bal'].fillna(summary['mean'])
        plt.figure(figsize=(22, 8))
        sns.histplot(data=self.df, x='tot_cur_bal', bins=400)
        plt.show()
        # Rows at or above 1000000 are treated as outliers and removed.
        within_limit = self.df['tot_cur_bal'] < 1000000
        self.df = self.df[within_limit]
        print(self.df.groupby('loan_status')['tot_cur_bal'].describe())
        
    def open_acc(self):
        """
        Print the missing-value report for 'open_acc', draw a count plot of
        the column split by 'loan_status' and print grouped statistics.
        """
        missing_report = self.check_missing_vals('open_acc')
        print("Missing vals: ", missing_report, "\n")
        plt.figure(figsize=(24, 6))
        sns.countplot(data=self.df, x='open_acc', hue='loan_status')
        by_status = self.df.groupby('loan_status')['open_acc']
        print(by_status.describe())
    
    def pub_rec(self):
        """
        Explore and trim 'pub_rec'.

        Prints the missing-value report, draws a count plot of the column,
        keeps only rows with a value below 3 (modifying self.df) and prints
        grouped statistics by 'loan_status'.
        """
        missing_report = self.check_missing_vals('pub_rec')
        print("Missing vals: ", missing_report, "\n")
        plt.figure(figsize=(24, 6))
        sns.countplot(data=self.df, x='pub_rec')
        plt.show()
        below_three = self.df['pub_rec'] < 3
        self.df = self.df[below_three]
        print(self.df.groupby('loan_status')['pub_rec'].describe())
  
    def purpose(self):
        """
        Print the missing-value report for 'purpose', draw a count plot
        split by 'loan_status', and plot the percentage of "Charged Off"
        loans per purpose category.
        """
        figure_size = (14, 6)
        print("Missing vals: ", self.check_missing_vals('purpose'), "\n")
        plt.figure(figsize=figure_size)
        sns.countplot(data=self.df, x='purpose', hue='loan_status')
        plt.xticks(rotation=90)
        plt.show()
        plt.figure(figsize=figure_size)
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("purpose")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("purpose")['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_charged_off.plot(kind='bar', cmap='viridis')
        plt.title("Percentage charged off per purpose category")
    
    def term(self):
        """
        Print the missing-value report for 'term', draw a count plot of the
        column, and plot the percentage of "Charged Off" loans per term.
        """
        missing_report = self.check_missing_vals('term')
        print("Missing vals: ", missing_report, "\n")
        sns.countplot(data=self.df, x='term')
        plt.show()
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby("term")['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby("term")['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_charged_off.plot(kind='bar')
        plt.title("Percentage charged off vs term category")
        
    def title(self):
        print(self.check_missing_vals('title'))
        print("unique values: ", len(self.df['title'].unique()))
        
    def revol_bal(self):
        """
        Explore 'revol_bal': print its missing-value report, draw a box plot
        by 'loan_status', print the percentage of rows with a balance above
        95000 and print grouped statistics.
        """
        # Look up the real column name; a misspelt name is never present in
        # the missing-value report and would always read as "not found".
        print("Missing vals: ", self.check_missing_vals('revol_bal'))
        sns.boxplot(data=self.df, y='loan_status', x='revol_bal')
        high_balance_rows = len(self.df[self.df['revol_bal'] > 95000])
        print("check what percentage is that")
        print(high_balance_rows * 100 / len(self.df))
        print(self.df.groupby('loan_status')['revol_bal'].describe())
        
        
    def revol_util(self):
        """
        Explore and clean 'revol_util'.

        Prints the missing-value report, fills nulls with the column mean
        (modifying self.df), draws a histogram and a box plot by
        'loan_status', and prints grouped statistics.
        """
        column = 'revol_util'
        print("Missing vals: ", self.check_missing_vals(column), "\n")
        column_mean = self.df[column].mean()
        self.df[column] = self.df[column].fillna(column_mean)
        plt.figure(figsize=(24, 6))
        sns.histplot(data=self.df, x=column, bins=88)
        plt.show()
        sns.boxplot(data=self.df, y='loan_status', x=column)
        plt.show()
        print(self.df.groupby('loan_status')[column].describe())
    
    def verification_status(self):
        """
        Print the missing-value report and value counts for
        'verification_status', draw a count plot split by 'loan_status',
        and plot the percentage of "Charged Off" loans per category.
        """
        column = "verification_status"
        print("Missing vals: ", self.check_missing_vals(column), "\n")
        print(self.df[column].value_counts())
        # Open a fresh figure for the count plot; the handles are not needed.
        plt.subplots()
        sns.countplot(data=self.df, x=column, hue='loan_status')
        plt.xticks(rotation=90)
        plt.show()
        loan_status = self.df['loan_status']
        charged_off = self.df[loan_status == "Charged Off"].groupby(column)['loan_status'].count()
        fully_paid = self.df[loan_status == "Fully Paid"].groupby(column)['loan_status'].count()
        total = charged_off + fully_paid
        percentage_charged_off = (charged_off * 100) / total
        percentage_charged_off.plot(kind='bar')
        plt.title("Percentage charged off per verification_status category")
        plt.xticks(rotation=90)
        plt.show()
