In [1]:
import numpy as np
import pandas as pd

In [3]:
X_train = np.load('Data/Train/X.npy')
X_prime_train = np.load('Data/Train/X_prime.npy')
feature_info_train = np.load('Data/Train/feature_information.npy')

In [5]:
# trainDataTrue = pd.DataFrame(data=X_train)
trainData = pd.DataFrame(data=X_prime_train)

In [8]:
print("Train shape:",X_train.shape)

Train shape: (30162, 106)
Test shape: (15060, 106)


In [9]:
isMissingTrain = pd.DataFrame(feature_info_train)

In [10]:
trainData = trainData.where(isMissingTrain > 0)

In [11]:
categoricalFeatures = [[i for i in range(6,13)],
                 [i for i in range(13,29)],
                 [i for i in range(29,36)],
                 [i for i in range(36,50)],
                 [i for i in range(50,56)],
                 [i for i in range(56,61)],
                 [i for i in range(61,63)],
                 [i for i in range(63,104)],
                 [i for i in range(104,106)]]
                 
print(categoricalFeatures)

[[6, 7, 8, 9, 10, 11, 12], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], [29, 30, 31, 32, 33, 34, 35], [36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], [50, 51, 52, 53, 54, 55], [56, 57, 58, 59, 60], [61, 62], [63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103], [104, 105]]


In [13]:
isCategorical = [i for _l in categoricalFeatures for i in _l]
        
startIndex = [l[0] for l in categoricalFeatures]
endIndex = [l[-1] for l in categoricalFeatures]

#### Method 1: Using mean/mode of individual features

In [25]:
def calculateSampleAverage(trainData, startIndex, endIndex, isCategorical):
    
    numExamples, features = trainData.shape[0], trainData.shape[1]
    # Dictionary to store mean/class prob of features
    meanValues = {}
    varValues = {}
    classProb = {}

    for col in range(features):
        # Categorical feature 
        if col in startIndex:
            start = col
            i = startIndex.index(col)
            end = endIndex[i]
            df = trainData.iloc[:,start:end+1]
            df = df.dropna()
            # Determine mode class
            colProb = np.array([df[c].sum()/len(df) for c in df.columns])
            for i in range(start, end+1):
                classProb[i] = colProb[start-i]

        elif col not in isCategorical:
            # Mean of numerical feature
            mean = trainData[col].mean()
            var = trainData[col].var()
            meanValues[col] = mean
            varValues[col] = var
        else:
            continue
    return meanValues, varValues, classProb

In [26]:
meanValues, varValues, classProb = calculateSampleAverage(trainData, startIndex, endIndex, isCategorical)

In [27]:
meanValues

{0: 0.2928808122974898,
 1: 0.11970850634603727,
 2: 0.6076997611736368,
 3: 0.010959190489203352,
 4: 0.02032728158217498,
 5: 0.40752656854314523}

In [28]:
varValues

{0: 0.03222033140632074,
 1: 0.0051374847647040955,
 2: 0.028906937548843558,
 3: 0.005525735962521754,
 4: 0.00860599060722077,
 5: 0.01484279928558613}

In [30]:
def newData(meanValues, varValues, classProb, startIndex, endIndex):
    numData = 1000
    new = np.zeros((numData, 106))
    for i in range(numData):
        for k in range(6):
            new[i,k] = np.random.normal(meanValues[k], varValues[k])
        for k in range(6,106):
            if k in startIndex:
                start = k
                j = startIndex.index(k)
                end = endIndex[j]
                prob = []
                numClasses = end - start + 1
                for idx in range(start, end+1):
                    prob.append(classProb[idx])
                c = np.random.choice(numClasses, 1, p=prob)
                new[i,c+start] = 1.0
            else:
                continue
    return new

In [31]:
genData = newData(meanValues, varValues, classProb, startIndex, endIndex)

In [33]:
genData = pd.DataFrame(genData)

In [34]:
genMeanValues, genVarValues, genclassProb = calculateSampleAverage(genData, startIndex, endIndex, isCategorical)

In [35]:
genMeanValues

{0: 0.2924214111642054,
 1: 0.11987308540883072,
 2: 0.6077337892767098,
 3: 0.010980774840031286,
 4: 0.020027922821447233,
 5: 0.4064011711731667}

In [36]:
genVarValues

{0: 0.0010464599828144455,
 1: 2.640854148484855e-05,
 2: 0.0008682312236189456,
 3: 3.2599882487270635e-05,
 4: 7.037120497579378e-05,
 5: 0.00021805212418640372}

#### Method 2: Using weights learned from linear regression:


In [37]:
# Using weights from linear regression
w = np.load('Data/weightLinearReg.npy')

In [38]:
def newDataWeighted(w, meanValues, varValues, classProb, startIndex, endIndex):
    numData = 1000
    new = np.zeros((numData, 106))
    for i in range(numData):
        for k in range(6):
            new[i,k] = w[k,k]*np.random.normal(meanValues[k], varValues[k])
        for k in range(6,106):
            if k in startIndex:
                start = k
                j = startIndex.index(k)
                end = endIndex[j]
                prob = []
                numClasses = end - start + 1
                # Choosing class based on class prob
                for idx in range(start, end+1):
                    prob.append(classProb[idx])
                c = np.random.choice(numClasses, 1, p=prob) + start
                new[i,c] = w[c,c]*1.0
            else:
                continue
    return new

In [39]:
genDataWeighted = newDataWeighted(w, meanValues, varValues, classProb, startIndex, endIndex)
genDataWeighted = pd.DataFrame(genDataWeighted)

In [41]:
genWeightedMeanValues,genWeightedVarValues, genWeightedclassProb = calculateSampleAverage(genDataWeighted, startIndex, endIndex, isCategorical)

In [42]:
genWeightedMeanValues

{0: 0.2916991727968813,
 1: 0.11994495657078844,
 2: 0.6093319375609193,
 3: 0.010553311447090522,
 4: 0.020650313658725473,
 5: 0.40714379652824506}

In [43]:
genWeightedVarValues

{0: 0.0010260472269033657,
 1: 2.5932446152676047e-05,
 2: 0.0008757809077890777,
 3: 3.411791418947263e-05,
 4: 7.852458286685193e-05,
 5: 0.00022476134896211924}