# I.A Engineering the input: Preparing the datasets

The fit.dat file is a space-separated file. The .ARFF format we require for Weka takes the data in comma-separated form. With the Python code below we can convert the file quickly and with minimal human error: the fit.dat file goes from a space-separated data set to a comma-separated data set.

Space Separated (fit.dat):

    22 85 203 174 9 0 362 40 0
    21 87 186 165 5 0 379 32 0
    30 107 405 306 25 0 756 99 0
    ...


Comma Separated (fit.csv):

    22,85,203,174,9,0,362,40,0
    21,87,186,165,5,0,379,32,0
    30,107,405,306,25,0,756,99,0
    ...

Then we can add the attribute labels and comments to create the .arff file so that Weka can interpret the attributes. The resulting file headers look like this:

   
    % ADA Software Fault Training Data
    ...
    @relation prediction_data_frame 

    @attribute NUMUORS real 
    @attribute NUMUANDS real 
    @attribute TOTOTORS real 
    @attribute TOTOPANDS real 
    @attribute VG real
    ...


    % ADA Software Fault Test Data
    ...
    @relation prediction_data_frame 

    @attribute NUMUORS real 
    @attribute NUMUANDS real 
    @attribute TOTOTORS real 
    @attribute TOTOPANDS real 
    @attribute VG real
    ...

   

## Preprocessing Data for Prediction

In [150]:
import csv

# Convert the raw data files to comma-separated rows. The outputs already
# carry the .arff name because the Weka header is prepended to them below.
ConvertSpaceToCSV("fit.dat", "fit_pred.arff")
ConvertTabToCSV("test.dat", "test_pred.arff")

# ARFF comment header for the training file.
fit_comments = """
% ADA Software Fault Training Data
% Author: Hector Lopez
% 9 attributes
% 188 instances
% fit.arff 
"""

# ARFF comment header for the test file.
test_comments = """
% ADA Software Fault Test Data
% Author: Hector Lopez
% 9 attributes
% 94 instances
% test.arff 
"""

# Relation/attribute declarations shared by both prediction files; every
# column, including FAULTS, is declared as a real-valued attribute.
attributes = """

@relation prediction_data_frame 

@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 

@data 
"""

# Prepend the comments and Weka attribute declarations to each CSV body so
# the files become valid .arff files.
AddWekaFeatures("fit_pred.arff", fit_comments + attributes)
AddWekaFeatures("test_pred.arff", test_comments + attributes)

# Preview the first 500 characters of each generated file. The files are
# opened in a ``with`` block so the handles are closed right after reading.
for arff_path in ("fit_pred.arff", "test_pred.arff"):
    with open(arff_path, "r", encoding="UTF8") as preview:
        print(preview.read(500))
    print("...")



% ADA Software Fault Training Data
% Author: Hector Lopez
% 9 attributes
% 188 instances
% fit.arff 


@relation prediction_data_frame 

@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 

@data 
22,85,203,174,9,0,362,40,0
21,87,186,165,5,0,379,32,0
30,107,405,306,25,0,756,99,0
6,5,19,6,2,0,160,9,0
21,47,168,148,7,0,352,29,0
28,38,16
...

% ADA Software Fault Test Data
% Author: Hector Lopez
% 9 attributes
% 94 instances
% test.arff 


@relation prediction_data_frame 

@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 

@data 
6,12,127,45,10,0,641,55,0
5,5,41,12,1,0,407,17,0
23,28,95,66,4,2,241,20,0
5,5,35,20,1,0,254,14,0
6,10,43,26,1,0,264,17,0
3,6,25,6,1,0,279,


## Preprocessing Data for Classification

In [151]:
# Convert the raw data files to intermediate comma-separated files.
ConvertSpaceToCSV("fit.dat", "fit_class.csv")
ConvertTabToCSV("test.dat", "test_class.csv")

# Prepend the fp/nfp class label to every row; the labelled rows are written
# to the .arff files that receive the Weka header below.
AddClassFeature("fit_class.csv", "fit_class.arff")
AddClassFeature("test_class.csv", "test_class.arff")

# ARFF comment header for the training file.
# NOTE(review): the text says "9 attributes" and "fit.arff", but ten
# @attribute lines are declared below and the file written is
# fit_class.arff -- confirm whether the header text should be updated.
fit_comments = """
% ADA Software Fault Training Data
% Author: Hector Lopez
% 9 attributes
% 188 instances
% fit.arff 
"""

# ARFF comment header for the test file (same review note applies).
test_comments = """
% ADA Software Fault Test Data
% Author: Hector Lopez
% 9 attributes
% 94 instances
% test.arff 
"""

# Relation/attribute declarations shared by both classification files: a
# nominal class attribute followed by the nine real-valued columns.
attributes = """

@relation classification_data_frame 

@attribute class {fp,nfp}
@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 


@data 
"""


# Prepend the comments and Weka attribute declarations to each labelled file
# so the files become valid .arff files.
AddWekaFeatures("fit_class.arff", fit_comments + attributes)
AddWekaFeatures("test_class.arff", test_comments + attributes)

# Preview the first 500 characters of each generated file. The files are
# opened in a ``with`` block so the handles are closed right after reading.
for arff_path in ("fit_class.arff", "test_class.arff"):
    with open(arff_path, "r", encoding="UTF8") as preview:
        print(preview.read(500))
    print("...")



% ADA Software Fault Training Data
% Author: Hector Lopez
% 9 attributes
% 188 instances
% fit.arff 


@relation classification_data_frame 

@attribute class {fp,nfp}
@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 


@data 
nfp,22,85,203,174,9,0,362,40,0
nfp,21,87,186,165,5,0,379,32,0
nfp,30,107,405,306,25,0,756,99,0
nfp,6,5,19,6,
...

% ADA Software Fault Test Data
% Author: Hector Lopez
% 9 attributes
% 94 instances
% test.arff 


@relation classification_data_frame 

@attribute class {fp,nfp}
@attribute NUMUORS real 
@attribute NUMUANDS real 
@attribute TOTOTORS real 
@attribute TOTOPANDS real 
@attribute VG real 
@attribute NLOGIC real 
@attribute LOC real 
@attribute ELOC real 
@attribute FAULTS real 


@data 
nfp,6,12,127,45,10,0,641,55,0
nfp,5,5,41,12,1,0,407,17,0
nfp,23,28,95,66,4,2,241,20,0
nfp,5,5,35,20,1,0,254

# Support Functions

In [147]:
def ConvertSpaceToCSV(source_file,dest_file):
    """Rewrite a space-delimited file as a comma-delimited file.

    Every row read from ``source_file`` (fields split on single spaces) is
    written to ``dest_file`` with commas between the fields.
    """
    with open(source_file, "r") as src, \
            open(dest_file, "w", newline='') as dst:
        space_rows = csv.reader(src, delimiter=' ')
        comma_out = csv.writer(dst, delimiter=',')
        comma_out.writerows(space_rows)

def ConvertTabToCSV(source_file,dest_file):
    """Rewrite a tab-delimited file as a comma-delimited file.

    Every row read from ``source_file`` (fields split on tabs) is written to
    ``dest_file`` with commas between the fields.
    """
    with open(source_file, "r") as tab_in, \
            open(dest_file, "w", newline='') as csv_out:
        tab_rows = csv.reader(tab_in, delimiter='\t')
        csv.writer(csv_out, delimiter=',').writerows(tab_rows)

def AddWekaFeatures(file,header):
    """Prepend ``header`` to the contents of ``file``, rewriting it in place.

    Args:
        file: Path of the file to rewrite.
        header: Text to place before the file's existing contents.
    """
    # Read the whole body and close the handle before reopening the same
    # path for writing; opening with "w" truncates the file.
    with open(file, "r") as infile:
        intext = header + infile.read()
    # The ``with`` block guarantees the new contents are flushed and the
    # handle is closed before the caller reads the file back.
    with open(file, "w") as outfile:
        outfile.write(intext)
    

# Add a nfp (non-fault-prone) / fp (fault-prone) class to each row
def AddClassFeature(source_file,dest_file,fault_threshold=2) :
    """Copy a comma-separated file, prepending a class label to every row.

    The last field of each row is parsed as an integer. Rows whose last field
    is at least ``fault_threshold`` are labelled "fp", all others "nfp".

    Args:
        source_file: Path of the comma-separated input file.
        dest_file: Path of the comma-separated output file.
        fault_threshold: Smallest last-field value labelled "fp" (default 2).

    Raises:
        ValueError: If the last field of a non-blank row is not an integer.
    """
    with open(source_file, "r") as infile, \
            open(dest_file, "w", newline='') as outfile:
        # The input is comma-separated, so parse it as such instead of
        # reading space-delimited rows and re-splitting the first field.
        reader = csv.reader(infile, delimiter=',')
        writer = csv.writer(outfile, delimiter=',')
        for row in reader:
            # Blank lines (e.g. a trailing newline) carry no data to label.
            if not any(field.strip() for field in row):
                continue
            label = "fp" if int(row[-1]) >= fault_threshold else "nfp"
            writer.writerow([label] + row)

# I.B Modeling assignment: Prediction

Graph the outputs from Weka.

In [None]:
import matplotlib.pyplot as plt

# Weka prediction output files; one figure is drawn per file.
pred_files = ['LM_GRDY_TestData_PredOutput','LM_M5_TestData_PredOutput','LM_None_TestData_PredOutput']

for f in pred_files :
    actuals = []
    predicted = []
    with open(f, "r") as infile:
        reader = csv.reader(infile, delimiter=',')
        for row in reader:
            # Skip blank lines and rows too short to hold both columns.
            if len(row) < 4:
                continue
            # NOTE(review): columns 2 and 3 are kept from the original code
            # as the actual/predicted values -- confirm against the layout
            # of the Weka output files.
            try:
                actual_value = float(row[2])
                predicted_value = float(row[3])
            except ValueError:
                # Non-numeric rows (e.g. a header line) are not plotted.
                continue
            # Lists grow with append(); they have no add() method.
            actuals.append(actual_value)
            predicted.append(predicted_value)
    # One x position per instance, shared by both series.
    t = range(len(actuals))
    plt.plot(t, actuals, 'r--', t, predicted, 'b--')
    plt.show()
