In [1]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


ImportError: No module named pandas

In [85]:
def clean_up(df, ports_dict):
    """Return a cleaned, all-numeric copy of a passenger DataFrame.

    Steps, in order:
      * missing ``Age`` values are replaced by the median of the known ages;
      * missing ``Embarked`` values are replaced by the most frequent port,
        then every port is mapped to its integer code via ``ports_dict``;
      * ``Sex`` is encoded as ``female -> 0``, ``male -> 1``;
      * ``Name``, ``PassengerId``, ``Cabin`` and ``Ticket`` are dropped;
      * missing ``Fare`` values are replaced by the median fare of the
        passenger's own ``Pclass`` (1, 2 or 3).

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame containing at least the columns Age, Sex, Embarked,
            Pclass, Fare, Name, PassengerId, Cabin and Ticket.
        ports_dict: mapping from each Embarked value to an integer code.

    Returns:
        A new DataFrame with the four dropped columns removed and the
        remaining columns cleaned as described above.

    Raises:
        KeyError: if an Embarked value is not a key of ``ports_dict``.

    Side effects:
        Prints three diagnostic lines: the number of female rows and the
        male / female median ages (computed after the age imputation).
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can safely be called again on the same input.
    df = df.copy()

    # Make NA ages the median of all ages
    # Possible improvement: Use median age by gender
    median_age = df['Age'].dropna().median()
    df.loc[df['Age'].isnull(), 'Age'] = median_age

    # Diagnostics only: these per-gender medians are printed, not used for
    # imputation, and are computed after the missing ages were filled in.
    male_ages = df.loc[df.Sex == 'male', 'Age']
    female_ages = df.loc[df.Sex == 'female', 'Age']

    print(len(female_ages))
    print(male_ages.dropna().median())
    print(female_ages.dropna().median())

    # Make NA Embarked be the mode.
    # Possible improvement: Randomly sample using the same distribution of ports
    # mode() returns a Series (there may be ties); take a single scalar so the
    # assignment does not depend on how many rows are missing.
    embarked_modes = df['Embarked'].dropna().mode()
    if len(embarked_modes) > 0:
        df.loc[df['Embarked'].isnull(), 'Embarked'] = embarked_modes.iloc[0]

    df['Embarked'] = df['Embarked'].map(lambda port: ports_dict[port]).astype(int)

    # Convert Sex to integer
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Remove the columns that should be dropped
    df = df.drop(['Name', 'PassengerId', 'Cabin', 'Ticket'], axis=1)

    # Missing Fares should be median of their respective classes.
    # The class comparison must be evaluated before it is combined with the
    # null mask: `&` binds tighter than `==`, so both sides are built as
    # separate boolean masks first.
    for pclass in (1, 2, 3):
        in_class = df['Pclass'] == pclass
        class_median_fare = df.loc[in_class, 'Fare'].dropna().median()
        df.loc[df['Fare'].isnull() & in_class, 'Fare'] = class_median_fare

    return df

# Load the training data.
train_df = pd.read_csv('train-original.csv', header=0)

# Convert Embarked to int
# Breaking this down:
# np.unique returns the sorted unique values in the input
# enumerate gives you an object that can give you tuples of index and value
# Missing values are dropped first: otherwise NaN would receive a port code of
# its own, and sorting a mix of strings and NaN fails on Python 3.
# The lookup is built from the training data only and reused for the test
# data so both frames share the same encoding.
ports = enumerate(np.unique(train_df['Embarked'].dropna()))
ports_dict = {name: i for i, name in ports}

train_df = clean_up(train_df, ports_dict)

# Test Data
test_df = pd.read_csv('test.csv', header=0)
# Keep the passenger ids before clean_up drops the PassengerId column;
# they are needed later to label the predictions.
ids = test_df['PassengerId'].values
test_df = clean_up(test_df, ports_dict)

314
28.0
28.0
152
27.0
27.0


# Random Forest
Best score: 0.72

In [80]:
# Training
# sklearn needs numpy arrays, not DataFrames, so convert back to a numpy array.
# train_data / test_data hold the complete frames; they stay defined here
# because other cells index into them.
train_data = train_df.values
test_data = test_df.values

# Select the features by column name rather than by position, so the train
# and test matrices always contain the same columns in the same order.
# NOTE(review): positional slicing with [:, 1:] assumes the first column of
# the test frame is a label column; if test_df has no 'Survived' column that
# slice silently discards a real feature -- confirm against test.csv.
feature_cols = [col for col in train_df.columns if col != 'Survived']
X_train = train_df[feature_cols].values
y_train = train_df['Survived'].values
X_test = test_df[feature_cols].values

# Fixed random_state so the fitted forest (and the submission) is reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(X_train, y_train)

# Accuracy on the training set, as a percentage. This is measured on the same
# rows the model was fitted on, so it is not an estimate of test performance.
score = forest.score(X_train, y_train)
print(score * 100)

print(forest.feature_importances_)
# info() prints its report itself and returns None, so it is not wrapped in print().
train_df.info()

output = forest.predict(X_test).astype(int)

# Write the submission through pandas: the file is closed even if writing
# fails, and the call behaves the same on Python 2 and Python 3.
pd.DataFrame({"PassengerId": ids, "Survived": output}).to_csv(
    "myfirstforest.csv", index=False, columns=["PassengerId", "Survived"]
)

97.9797979798
[ 0.08196316  0.26842242  0.25356262  0.05037553  0.03892133  0.27223633
  0.03451861]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(6)
memory usage: 55.8 KB
None


# Logistic Regression
Best score: 0.75598

In [78]:
# Select the features by column name rather than by position, so the train
# and test matrices always contain the same columns in the same order.
# NOTE(review): slicing test_data with [:, 1:] assumes its first column is a
# label column; if test_df has no 'Survived' column that slice silently
# discards a real feature -- confirm against test.csv.
logit_feature_cols = [col for col in train_df.columns if col != 'Survived']
logit_X_train = train_df[logit_feature_cols].values
logit_y_train = train_df['Survived'].values
logit_X_test = test_df[logit_feature_cols].values

logistic = LogisticRegression()
logistic = logistic.fit(logit_X_train, logit_y_train)

# Accuracy on the training set, as a percentage. This is measured on the same
# rows the model was fitted on, so it is not an estimate of test performance.
score = logistic.score(logit_X_train, logit_y_train)
print(score * 100)

output = logistic.predict(logit_X_test).astype(int)

# Write the submission through pandas: the file is closed even if writing
# fails, and the call behaves the same on Python 2 and Python 3.
pd.DataFrame({"PassengerId": ids, "Survived": output}).to_csv(
    "logistic.csv", index=False, columns=["PassengerId", "Survived"]
)



80.0224466891
