Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
julosaure
committed
Aug 14, 2013
1 parent
3ccf0f2
commit e93ac6b
Showing
2 changed files
with
119 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
|
||
import sys, argparse, csv, fileinput, csv, string,re | ||
|
||
NB_COLUMN=42 | ||
|
||
HEADER=""" | ||
@relation LoanStats | ||
@attribute 'Loan ID' string | ||
@attribute 'Amount Requested' numeric | ||
@attribute 'Amount Funded By Investors' numeric | ||
@attribute 'Interest Rate' string | ||
@attribute 'Loan Length' {'36 months','60 months'} | ||
@attribute 'Application Date' date yyyy-MM-dd | ||
@attribute 'Application Expiration Date' date yyyy-MM-dd | ||
@attribute 'Issued Date' date yyyy-MM-dd | ||
@attribute 'CREDIT Grade' {E2,A2,A4,C1,B1,C4,B5,B3,A5,A3,B2,C2,F4,C5,E1,D3,B4,D1,F1,D4,C3,D2,F2,A1,E4,F3,E5,D5,E3,G5,F5,G1,G4,G3} | ||
@attribute 'Loan Title' string | ||
@attribute 'Loan Purpose' {debt_consolidation,other,credit_card,home_improvement,small_business,educational,vacation,moving,car,wedding,house,medical,major_purchase,renewable_energy} | ||
@attribute 'Loan Description' string | ||
@attribute 'Monthly PAYMENT' numeric | ||
@attribute Status {'Fully Paid','Charged Off'} | ||
@attribute 'Total Amount Funded' numeric | ||
@attribute 'Debt-To-Income Ratio' string | ||
@attribute 'Remaining Principal Funded by Investors' numeric | ||
@attribute 'Payments To Date (Funded by investors)' numeric | ||
@attribute 'Remaining Principal ' numeric | ||
@attribute 'Payments To Date' numeric | ||
@attribute 'Screen Name' string | ||
@attribute City string | ||
@attribute State {MA,MD,NC,CO,WI,FL,NY,GA,IN,VA,ME,NJ,CT,AZ,NE,WA,CA,OH,NM,MO,UT,LA,SC,KS,AL,TN,NH,SD,TX,DE,WY,KY,IL,OR,MN,DC,NV,VT,MI,IA,RI,PA,AR,HI,ID,OK,AK,MS,MT,WV} | ||
@attribute 'Home Ownership' {OWN,MORTGAGE,RENT,NONE,OTHER} | ||
@attribute 'Monthly Income' numeric | ||
@attribute 'FICO Range' string | ||
@attribute 'Earliest CREDIT Line' date yyyy-MM-dd | ||
@attribute 'Open CREDIT Lines' numeric | ||
@attribute 'Total CREDIT Lines' numeric | ||
@attribute 'Revolving CREDIT Balance' string | ||
@attribute 'Revolving Line Utilization' string | ||
@attribute 'Inquiries in the Last 6 Months' numeric | ||
@attribute 'Accounts Now Delinquent' numeric | ||
@attribute 'Delinquent Amount' numeric | ||
@attribute 'Delinquencies (Last 2 yrs)' numeric | ||
@attribute 'Months Since Last Delinquency' string | ||
@attribute 'Public Records On File' numeric | ||
@attribute 'Months Since Last Record' string | ||
@attribute Education string | ||
@attribute 'Employment Length' string | ||
@attribute Code string | ||
@attribute 'Initial Listing Status' {F} | ||
@data | ||
""" | ||
|
||
# {660-664,810-814,740-744,690-694,760-764,755-759,785-789,670-674,705-709,715-719,775-779,750-754,675-679,700-704,795-799,770-774,720-724,710-714,'',745-749,680-684,780-784,790-794,665-669,685-689,735-739,800-804,765-769,725-729,695-699,730-734,820-824,805-809,815-819} | ||
# """ + \ # {< 1 year,1 year,2 years,3 years,4 years,5 years,6 years,7 years,8 years,9 years,10+ years} | ||
|
||
def parseCsv(inputCsv, outputCsv): | ||
csvWriter = csv.writer(outputCsv, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL, doublequote=False, escapechar='\\') | ||
with open(inputCsv) as csvFile: | ||
csvReader = csv.reader(csvFile, delimiter=',', quotechar='"') | ||
for row in csvReader: | ||
if len(row) != NB_COLUMN: | ||
continue | ||
# remove "s | ||
row = map(lambda s:string.replace(s, '"', ' '), row) | ||
csvWriter.writerow(row) | ||
|
||
|
||
M_STATUS = re.compile("(Fully Paid|Charged Off)") | ||
|
||
def createArff(inputCsv, output): | ||
output.write(HEADER) | ||
cpt = 0 | ||
for line in fileinput.input(inputCsv): | ||
if cpt == 0: | ||
# skip header line | ||
cpt += 1 | ||
continue | ||
line = line.strip("\r\n") | ||
tab = line.split(",") | ||
print tab[13] | ||
if not M_STATUS.match(tab[13]): | ||
continue | ||
line = ",".join(tab) | ||
output.write(line+"\n") | ||
|
||
def main(inputCsv, outputCsv): | ||
#parseCsv(inputCsv, outputCsv) | ||
createArff(inputCsv, outputCsv) | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description="Cleans an input csv file, and outputs the new csv (to stdout by default).") | ||
parser.add_argument("input", help="the input csv") | ||
parser.add_argument("-o", "--output", help="the output csv") | ||
args = parser.parse_args() | ||
|
||
out = sys.stdout | ||
if args.output: | ||
out = open(args.output, "w") | ||
|
||
main(args.input, out) | ||
|
||
if args.output: | ||
out.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash | ||
|
||
|
||
OPTIONS=" -B 1000 -format yyyy-MM-dd -D 6,7,8,27 " # 5,6,7,24 " | ||
OPTIONS=$OPTIONS" -S 4,10,12,16,21,22,30,38,40 " # 3,9,11,15,20,21,29,37,39 " | ||
OPTIONS=$OPTIONS" -N 5,9,11,14,23,24,26,39,41 " # 4,8,10,13,22,23,24,38,40 " | ||
|
||
#OPTIONS=$OPTIONS" -L 5:36 months,60 months " | ||
CMD="java -cp /Users/julien/workspaces/weka/weka-3-7-9/weka.jar weka.core.converters.CSVLoader $OPTIONS data/LoanStats10k_.csv > data/LoanStats10k_.arff" | ||
|
||
echo $CMD | ||
|
||
$CMD |