In [2]:
import requests
import re
import os
from bs4 import BeautifulSoup
from urllib.request import urlopen
from zipfile import ZipFile
from io import BytesIO
import time
import datetime
import sys
from tqdm import tqdm
import pandas as pd
import numpy as np
import glob
import csv

In [3]:
def assure_path_exists(path):
    if not os.path.exists(path):
            os.makedirs(path)

In [4]:
def extracrtZip(s,monthlistdata,path):
    abc = tqdm(monthlistdata)
    for month in abc:
        abc.set_description("Downloading %s" % month)
        r = s.get(month)
        z = ZipFile(BytesIO(r.content)) 
        z.extractall(path)  

In [5]:
def get_files(foldername, start_yr,end_yr, web_post):
    with requests.Session() as s:
        preUrl = s.post(web_post['url'], data={'username': web_post['username'], 'password': web_post['password']})  # post the user name and password
        payload2={'accept': 'Yes','acceptSubmit':'Continue','action':'acceptTandC'}  # post the accept and continue
        finalUrl=s.post(web_post['posturl'],payload2)
        linkhtml =finalUrl.text 
        allzipfiles=BeautifulSoup(linkhtml, "html.parser")
        ziplist=allzipfiles.find_all('td')
        sampledata=[]
        historicaldata=[]
        count=0
        slist=[]
        for i in range(int(start_yr),int(end_yr)+1):
            #print(i)
            slist.append(i)
        for li in ziplist:
            zipatags=li.findAll('a')
            for zipa in zipatags:
                for yr in slist:
                    if str(yr) in zipa.text:
                        if re.match('sample',zipa.text):    #sample vs. historicl files
                            link = zipa.get('href')
                            Samplepath=str(os.getcwd())+"/"+foldername
                            assure_path_exists(Samplepath)
                            finallink ='https://freddiemac.embs.com/FLoan/Data/' + link
                            sampledata.append(finallink) 
        extracrtZip(s,sampledata,Samplepath)

In [6]:
def prep_orig_table(foldername):

    filename= "Origination_Input_All.csv"
    
    headers = ['fico','first_pmnt_dt','first_hb_flag','mat_dt','msa_cd',"pmi_pct",'unit_cnt','occ_type','cltv','dti','orig_upb','ltv','int_rt','channel','ppm_flag','prod_type','prop_state', 'prop_type','zip_code','loan_id','loan_purpose', 'orig_loan_term','bor_cnt','seller_name','servicer_name', 'sc_flag']
    types = {'fico': 'float',
             'first_pmnt_dt': 'int64',
             'first_hb_flag': 'char',
             'mat_dt': 'int64',
             'msa_cd': 'char',
             'pmi_pct': 'float',
             'unit_cnt': 'float',
             'occ_type': 'char',
             'cltv': 'float',
             'dti': 'float',
             'orig_upb': 'float',
             'ltv': 'float',
             'int_rt': 'float',
             'channel': 'char',
             'ppm_flag': 'char',
             'prod_type': 'char',
             'prop_state': 'char', 
             'prop_type': 'char',
             'zip_code': 'int64',
             'loan_id': 'char',
             'loan_purpose': 'char', 
             'orig_loan_term': 'float',
             'bor_cnt': 'int64',
             'seller_name': 'char',
             'servicer_name': 'char',
             'sc_flag': 'str'}
    
    fm_files = str(os.getcwd())+"/"+foldername+"/sample_orig_*.txt"

    abc = tqdm(glob.glob(fm_files))
    
    counter = 0
    with open(filename, 'w',encoding='utf-8',newline="") as file:
        for f in abc: 
            abc.set_description("Working on  %s" % f)
            sample_df = pd.read_csv(f ,sep="|", names=headers, skipinitialspace=True) 
            sample_df['vintage'] = np.where(sample_df.loan_id.str[2:4] == '99', '19'+ sample_df.loan_id.str[2:6], '20'+ sample_df.loan_id.str[2:6])
            if counter == 0:
                sample_df.to_csv(file, mode='a', header=True, index=False)
            else:
                sample_df.to_csv(file, mode='a', header=False, index=False)
            counter += 1

In [7]:
def prep_svcg_table(foldername): 
    
    filename= "Servicing_Input_All.csv"
    
    headers= ['loan_id','rprt_dt','cur_upb','dlq_stat','loan_age','rem_months', 'repo_flag','mod_flag', 'zero_bal_cd', 'zero_bal_dt','cur_int_rt','cur_def_upb','lp_due_dt','pmi_rec', 'net_sale_proc','non_pmi_rec','expenses', 'legal_cost', 'maint_cost','taxes_ins_cost','misc_cost','actual_loss', 'mod_cost', 'step_mod_flag', 'def_pmt_mod_flag', 'eltv']
    types=   {'loan_id': 'str',
              'rprt_dt': 'int64',  
              'cur_upb':'float',  
              'dlq_stat': 'str',     
              'loan_age':'float',   
              'rem_months': 'float',      
              'repo_flag':'str',      
              'mod_flag':'str',      
              'zero_bal_cd':'str',         
              'zero_bal_dt':'float',      
              'cur_int_rt':'float',     
              'cur_def_upb':'float',      
              'lp_due_dt':'float',    
              'net_sale_proc':'str',   
              'pmi_rec':'float',        
              'non_pmi_rec':'float',      
              'expenses':'float',    
              'legal_cost':'float',      
              'maint_cost':'float',     
              'taxes_ins_cost':'float',         
              'misc_cost':'float',    
              'actual_loss':'float',       
              'mod_cost':'float',    
              'step_mod_flag':'str',           
              'def_pmt_mod_flag':'str',
              'eltv': 'float'}  
    
    fm_files = str(os.getcwd())+"/"+foldername+"/sample_svcg_*.txt"
   
    abc = tqdm(glob.glob(fm_files))
    
    counter = 0
    with open(filename, 'w',encoding='utf-8',newline="") as file:
        for f in abc: 
            abc.set_description("Working on  %s" % f)
            perf_df = pd.read_csv(f ,sep="|", names=headers, dtype = types, skipinitialspace=True) 
            if counter == 0:
                perf_df.to_csv(file, mode='a', header=True,index=False)
            else: 
                perf_df.to_csv(file, mode='a', header=False,index=False)
            counter += 1

In [12]:
def main():
    web_post ={'url':'https://freddiemac.embs.com/FLoan/secure/auth.php',
               'posturl':'https://freddiemac.embs.com/FLoan/Data/download.php',
               'username':'kenneth.chen1@ey.com',
               'password':'z@2WTd\|'}
    
    start_yr = 2003
    end_yr = 2017
    foldername= 'FreddieMac_LoanLevel_Files'
    
#     get_files(foldername, start_yr, end_yr, web_post)
    prep_orig_table(foldername)
    prep_svcg_table(foldername)

In [None]:
if __name__ == '__main__':
    main()

  exec(code_obj, self.user_global_ns, self.user_ns)
Working on  /home/ec2-user/SageMaker/qbb-work/FreddieMac_LoanLevel_Files/sample_orig_2012.txt: 100%|██████████| 15/15 [00:16<00:00,  1.10s/it]
Working on  /home/ec2-user/SageMaker/qbb-work/FreddieMac_LoanLevel_Files/sample_svcg_2010.txt:  73%|███████▎  | 11/15 [07:02<02:36, 39.09s/it]