# Final Project

## Data Pre-Processing

In [None]:
import gzip
import pandas as pd
from collections import defaultdict,Counter
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Given functions for reading the data
def readGz(f):
    """Yield one parsed record per line of the gzip-compressed file *f*.

    Each line is expected to hold a Python literal (e.g. a dict). The file is
    opened in a ``with`` block so the handle is closed once the generator is
    exhausted or closed.
    """
    # NOTE(review): eval() executes whatever the file contains. Only use this
    # on trusted data; ast.literal_eval is the safe choice for plain literals.
    with gzip.open(f) as handle:
        for l in handle:
            yield eval(l)

def parse(path):
    """Yield one parsed record per line of the gzip file at *path*.

    The file is read in binary mode and each line is evaluated as a Python
    literal. The handle is managed by a ``with`` block so it is released when
    iteration finishes instead of leaking.
    """
    # NOTE(review): eval() executes whatever the file contains. Only use this
    # on trusted data; ast.literal_eval is the safe choice for plain literals.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

def getDF(path):
    """Read every record yielded by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order; that number becomes the
    row label and the record's keys become the columns.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [None]:
# Load train and test data:
# getDF reads each gzip file completely and returns one DataFrame row per record.
df = getDF('train.json.gz')
test_df = getDF('test_Helpful.json.gz')

In [None]:
# Initial training + validation view
# Print (rows, columns), then render the first three rows as the cell output.
print(df.shape)
df.head(3)

In [None]:
# Initial test view
# Print (rows, columns), then render the first three rows as the cell output.
print(test_df.shape)
test_df.head(3)

In [None]:
# Check nulls: fraction of missing values per column.
# Each frame is divided by its OWN row count; dividing the test counts by the
# training row count would misstate the test-set fractions.
print("Train:")
print(df.isnull().sum()/df.shape[0])
print('\nTest:')
print(test_df.isnull().sum()/test_df.shape[0])

The price feature will be dropped: a substantial share of the training data is missing it, so keeping it would risk biasing the results, even though the test set has it more readily available.

Since we have text data, it may be valuable to extract a review-length column and to convert the date column into something meaningful. My preliminary thought is that the helpfulness feature could have been added retroactively, in which case it may not have been used early on. My other thought is that seasonality could be at play for shopping, with some people valuing more comprehensive reviews around the holidays, etc.

In [None]:
# Apply the same feature engineering to the training and the test frame.

# 1) Discard the sparsely populated price column (in place).
for frame in (df, test_df):
    frame.drop(['price'], axis=1, inplace=True)

# 2) Turn the unix timestamp (seconds) into a datetime plus year/month parts.
for frame in (df, test_df):
    frame['date'] = pd.to_datetime(frame['unixReviewTime'], unit='s')
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month

# 3) Review length = number of pieces after splitting the text on single spaces.
for frame in (df, test_df):
    frame['len_review'] = frame['reviewText'].apply(lambda text: len(str(text).split(' ')))

In [None]:
# Look at category distribution
# value_counts() returns counts sorted from most to least frequent, so the bars
# appear in that order.
df['categoryID'].value_counts().plot.bar()
plt.title('Distribution of Categories Across Training Set')
plt.xlabel('Category')
# Trailing semicolon keeps the notebook from echoing the returned object.
plt.ylabel('Count');

It would appear that the category distribution is not even: category 0 is disproportionately represented.

For use in the algorithm, category ID must be converted to "one-hot" as it is not a numerical feature.

In [None]:
# One-hot encode categoryID into columns named category_<id>.
categories = pd.get_dummies(df['categoryID']).rename(columns=lambda x: 'category_'+str(x))
df = pd.concat([df,categories],axis=1)
categories_t = pd.get_dummies(test_df['categoryID']).rename(columns=lambda x: 'category_'+str(x))
# get_dummies only creates columns for values present in the frame it is given.
# Align the test columns to the training columns so both frames expose the same
# category_* features: a category absent from the test set becomes an all-zero
# column, and a category never seen in training is dropped.
categories_t = categories_t.reindex(columns=categories.columns, fill_value=0)
test_df = pd.concat([test_df,categories_t],axis=1)

The rationale behind using sentiment on the summary is that readers are still conditioned to look at headlines. My guess is that the more attention-grabbing a comment appears (either extremely negative or extremely positive), the more it will be viewed; and the more it is viewed, the more likely it is to be classified as helpful or not.

In [None]:
# Let us add some sentiment factor into the model:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# Single shared VADER analyser instance, reused for every call below.
analyser = SIA()


def sentiment_analyzer_scores(sentence):
    """Return the "compound" entry of the analyser's polarity scores for *sentence*."""
    return analyser.polarity_scores(sentence).get("compound")

In [None]:
# Summary compound sentiment:
# Score every review summary and store the result as a new numeric feature.
df['summary_sent'] = df['summary'].apply(sentiment_analyzer_scores)
test_df['summary_sent'] = test_df['summary'].apply(sentiment_analyzer_scores)

Last step, clean target variable and extract information:

In [None]:
# Expand the 'helpful' column into separate columns.
# dict(series) maps each row label to that row's 'helpful' value (presumably a
# dict with keys such as 'outOf' / 'nHelpful' — confirm against the raw data);
# from_dict turns every row label into a column, and .T flips the result back
# so rows are the original labels and the dict keys become columns.
helpful = pd.DataFrame.from_dict(dict(df['helpful'])).T
# Concatenate column-wise; rows are matched on the index.
df = pd.concat([df, helpful], axis=1)
helpful_t = pd.DataFrame.from_dict(dict(test_df['helpful'])).T
test_df = pd.concat([test_df, helpful_t], axis=1)

In [None]:
# Final Training+Validation Data
# The engineered columns (date parts, len_review, category_*, summary_sent and
# the expanded helpful counts) should now be listed alongside the raw ones.
print(df.shape)
print(df.columns)
df.head(3)

In [None]:
# Final Test Data
# Same preview for the test frame after the identical preprocessing steps.
print(test_df.shape)
test_df.head(3)

In [None]:
# If model value is poor, potentially investigate using more information from the categories:
# Shows the element at position 2 of the 'categories' value of the fifth row.
df.iloc[4]['categories'][2]

## Training Set Analysis

In [None]:
# Model inputs. The order matters: later cells slice fin_train by position.
feature_list = ['rating','year', 'month', 'len_review', 'category_0',
       'category_1', 'category_2', 'category_3', 'category_4', 'summary_sent',
       'outOf']
fin_test = test_df
# The training frame additionally carries the raw target count.
feature_list.append('nHelpful')
# .copy() makes fin_train an independent frame, so adding a column below does
# not write into a slice of df (avoids pandas' SettingWithCopyWarning and the
# ambiguity about whether the assignment takes effect).
fin_train = df[feature_list].copy()
# Rows with outOf == 0 divide by zero here and end up without a finite ratio;
# the modelling cells handle outOf == 0 separately.
fin_train['helpful_rate'] = fin_train['nHelpful']/fin_train['outOf']

In [None]:
# Sanity-check the modelling frame: size, column names and per-column fraction
# of nulls, followed by a preview of the first rows.
print('Dimensions:',fin_train.shape)
print(fin_train.columns)
print(fin_train.isnull().sum()/fin_train.shape[0])
fin_train.head()

In [None]:
# Would appear that a good chunk of our data can be represented as a classification problem
# Ten most frequent helpful_rate values, each as a percentage of all training rows.
fin_train['helpful_rate'].value_counts().nlargest(10)/fin_train.shape[0]*100

In [None]:
# Capture some summary statistics:
# Transposed so each feature is a row and each statistic a column.
fin_train.describe().T

Preliminary findings: High variability in the target variable, with some outliers. Some decent variation in sentiment analysis. Year and month extraction as features appears to be valuable, for the rating feature and seasonality respectively. Rating seems like it will be quite helpful as well. If we look at the ratio between "Out of" and "n Helpful", we can see that the majority of the results are 1:1 - meaning a combination of classification and regression should be robust.

Now that we have a complete training set, let us visualize some of the results:

In [None]:
# One 50-bin histogram per column of fin_train, drawn on a single large figure.
fin_train.hist(bins = 50, figsize=(20,12));

In [None]:
import seaborn as sns
cov_mat = fin_train.cov() # to get a heatmap of the covariance matrix
# NOTE(review): vmax=1 caps the colour scale at 1, so every covariance above 1
# is drawn in the darkest shade — confirm this is intended for unscaled features.
cov_plot = sns.heatmap(cov_mat, vmax=1, square = True,cmap="Blues")
cov_plot.set_title('Training Set COV Matrix');

In [None]:
cor_mat = fin_train.corr() # to get a heatmap of the correlation matrix
# vmax=1 pins the top of the colour scale to 1; the lower end is left for
# seaborn to choose.
cor_plot = sns.heatmap(cor_mat, vmax=1, square = True,cmap="Blues")
cor_plot.set_title('Training Set Corr. Matrix');

It would appear that, for the most part, the "outOf" feature correlates highly with the target variable. It should be considered the most important feature in the model, with review length also being more highly correlated with the target than the remaining features. Based on my analysis, tree-based models should be robust.

## Model Building

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV,StratifiedKFold

# Positional split that relies on the column order used to build fin_train:
# columns 0-10 are the eleven input features (ending with 'outOf'), column 11
# is 'nHelpful' (the target here); 'helpful_rate' (column 12) is left out.
X, y = fin_train.iloc[:,0:11],fin_train.iloc[:,11]

In [None]:
# Same positional slices as X and y: columns 0-10 as features, column 11 as label.
data_dmatrix = xgb.DMatrix(data=fin_train.iloc[:,0:11],label=fin_train.iloc[:,11])

# NOTE(review): 'n_estimators' is an argument of the scikit-learn wrapper; with
# xgb.cv the number of rounds comes from num_boost_round below, so this entry is
# presumably ignored here — confirm.
# NOTE(review): newer xgboost releases reportedly rename "reg:linear" to
# "reg:squarederror" — confirm against the installed version.
params = {"objective":"reg:linear",'colsample_bytree': 0.3,
                'max_depth': 4, 'alpha': 9,'n_estimators':100}

# 3-fold CV for up to 200 boosting rounds, tracking MAE, with early stopping
# after 10 rounds; the result is returned as a DataFrame (as_pandas=True).
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=200,early_stopping_rounds=10,metrics="mae", as_pandas=True, seed=123)

In [None]:
# Skip the first 20 boosting rounds and drop the two std columns so that only
# the remaining (mean) train/test MAE curves are plotted.
cv_results[20:].drop(['test-mae-std','train-mae-std'],axis=1).plot()
plt.xlabel('Boost Round')
plt.ylabel('Mean Absolute Error')
plt.title('Base XGBoost Regression Overfits');

The model barely outperforms the baseline. To improve it, we will need to apply grid search and use a validation set across k-fold CV to tune the model parameters:

In [None]:
def fin_results(row):
    """Return the final prediction for one row.

    A row whose 'outOf' is 0 is forced to 0; every other row keeps the value
    stored under 'xgb_regress'.
    """
    return 0 if row['outOf'] == 0 else row['xgb_regress']

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Per-fold validation MAE, without and with rounding of the predictions.
mae = []
mae_rd = []
feature_list = ['rating','year', 'month', 'len_review', 'category_0',
       'category_1', 'category_2', 'category_3', 'category_4', 'summary_sent',
       'outOf']
iteration = 0
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index,test_index in kf.split(X,y):
    print('\n{} of kfold {}'.format(iteration,kf.n_splits))
    # split() yields POSITIONAL row numbers, so select with .iloc. Label-based
    # .loc / [] only gives the same rows while the index happens to be 0..n-1.
    xtr,xvl = X.iloc[train_index],X.iloc[test_index]
    ytr,yvl = y.iloc[train_index],y.iloc[test_index]
    CV_params = dict(learning_rate=[0.01, 0.1])
    params = {"objective":"reg:linear",'colsample_bytree': 0.3,
                'max_depth': 4, 'alpha': 9,'n_estimators':100}
    # 'neg_mean_absolute_error' is the supported scorer name in
    # sklearn.model_selection; the bare 'mean_absolute_error' alias was
    # deprecated and later removed. Both rank candidates by negated MAE.
    xgb_reg = GridSearchCV(xgb.XGBRegressor(**params),CV_params,cv = 10, scoring = 'neg_mean_absolute_error',n_jobs = -1)
    xgb_reg.fit(xtr,ytr)
    print("Best parameters:")
    print(xgb_reg.best_params_)

    # Predicting on the validation set:
    xvl = xvl.reset_index(drop = True)
    model_perf = pd.DataFrame({'outOf':xvl['outOf'], 'xgb_regress':xgb_reg.predict(xvl)})
    # Rows with outOf == 0 are forced to a prediction of 0.
    model_perf['pred'] = model_perf.apply(fin_results,axis = 1)

    # Compute each metric once and reuse it for printing and bookkeeping.
    actual = list(yvl)
    fold_mae = mean_absolute_error(actual, list(model_perf.pred))
    fold_mae_rd = mean_absolute_error(actual, list(np.round(model_perf.pred)))
    print("MAE w/o rounding: %0.3f" % fold_mae)
    mae.append(fold_mae)
    print("MAE w/ rounding: %0.3f" % fold_mae_rd)
    mae_rd.append(fold_mae_rd)
    iteration += 1

print('Without rounding, "mean" mae: %0.3f' % np.mean(mae))
print('With rounding, "mean" mae: %0.3f' % np.mean(mae_rd))

In [None]:
# Temporarily enlarge the default figure so the tree is legible.
plt.rcParams['figure.figsize'] = 60, 30
# xgb_reg is the grid search left over from the LAST fold of the loop above;
# num_trees=2 presumably selects the tree at index 2 of the booster — confirm.
xgb.plot_tree(xgb_reg.best_estimator_,num_trees=2)
# Set the default figure size to 8x6 for the following plots.
plt.rcParams['figure.figsize'] = 8, 6

In [None]:
# Initial Model
# Uses the estimator and validation columns left over from the last fold.
# NOTE: this rebinds the notebook-level name `features`.
features = xvl.columns
importances = xgb_reg.best_estimator_.feature_importances_
# argsort gives ascending order, so the most important feature is drawn last
# (at the top of the horizontal bar chart).
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.ylabel('Feature');

Trying to model the target directly is not the right approach: there is too much variability present and the model relies heavily on outOf (see example tree); the base model was better off. Next, we want to implement some sort of iterated classifier based on the ratio. The final prediction should combine 'outOf' with the predicted ratio.

Additionally, these libraries did not properly allow iterated k-fold together with a grid search on those folds while creating new validation sets. Instead, we implement an iterated random-split approach that tests on a new validation set each time.

In [None]:
def n_train_split(dataframe, features, target, random_state):
    """Split *dataframe* into an 80/20 training/validation pair.

    Parameters:
        dataframe: source frame holding both feature and target columns.
        features: column names used as model inputs.
        target: column name (or list of names) used as the target frame.
        random_state: seed passed to ``train_test_split`` so that each call
            is reproducible.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)`` — note the order differs from
        what ``train_test_split`` itself returns.
    """
    X_train = pd.DataFrame(dataframe, columns=features)
    y_train = pd.DataFrame(dataframe[target])
    # Use the random_state argument. The previous code read a global `i`,
    # which only worked because callers happened to loop over a variable
    # with that name.
    X_train_n, X_valid, y_train_n, y_valid = train_test_split(
        X_train, y_train, test_size=0.2, random_state=random_state)
    return X_train_n, y_train_n, X_valid, y_valid

def fin_results2(row):
    """Combine the classifier and regressor outputs into one prediction.

    * 'outOf' == 0 -> 0
    * 'outOf' == 1 -> 'outOf' times the value under 'xgb_class'
    * otherwise    -> 'outOf' times the value under 'xgb_reg'
    """
    votes = row['outOf']
    if votes == 0:
        return 0
    ratio = row['xgb_class'] if votes == 1 else row['xgb_reg']
    return votes * ratio

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn import ensemble
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold 
from sklearn.metrics import mean_absolute_error

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Validation MAE per random split, without and with rounding.
mae = []
mae_rd = []
features = ['rating', 'year', 'month', 'len_review', 'category_0', 'category_1',
            'category_2', 'category_3', 'category_4', 'summary_sent', 'outOf']
iteration = 0
for i in range(5):
    print('\n{} of kfold {}'.format(iteration,5))
    # Fresh 80/20 split for this round; the round number is the seed.
    X_train_n, y_train_n, X_valid, y_valid = n_train_split(
        fin_train, features, ['helpful_rate', 'nHelpful'], i)

    # --- Classifier: trained only on rows with exactly one vote -------------
    single_vote = X_train_n['outOf'] == 1
    X_train_cl = X_train_n[single_vote]
    y_train_cl = np.array(y_train_n['helpful_rate'][single_vote])
    kf_cl = StratifiedKFold(y_train_cl, n_folds=5, shuffle=True, random_state=0)
    grid_search = dict(learning_rate=[0.01,0.05,0.1],loss=['deviance','exponential'])
    params = {'n_estimators': 100, 'max_depth': 4}
    gbclf = GridSearchCV(
        ensemble.GradientBoostingClassifier(**params),
        grid_search,
        cv=kf_cl,
        scoring='mean_absolute_error',
        n_jobs=-1,
    )
    gbclf.fit(X_train_cl, y_train_cl)
    print("Best classification parameters:")
    print(gbclf.best_params_)

    # --- Regressor: trained on rows with two or more votes ------------------
    multi_vote = (X_train_n['outOf'] != 0) & (X_train_n['outOf'] != 1)
    X_train_reg = X_train_n[multi_vote]
    y_train_reg = np.array(y_train_n['helpful_rate'][multi_vote])
    kf_reg = StratifiedKFold(y_train_reg, n_folds=5, shuffle=True, random_state=0)
    grid_search = dict(learning_rate=[0.01,0.05,0.1], loss=['ls', 'lad'])
    params = {'n_estimators': 100, 'max_depth': 4}
    gbreg = GridSearchCV(
        ensemble.GradientBoostingRegressor(**params),
        grid_search,
        cv=kf_reg,
        scoring='mean_absolute_error',
        n_jobs=-1,
    )
    gbreg.fit(X_train_reg, y_train_reg)
    print("Best regression parameters:")
    print(gbreg.best_params_)

    # --- Score on the held-out rows ------------------------------------------
    X_valid.reset_index(drop=True,inplace=True)
    model_perf = pd.DataFrame({
        'outOf': X_valid['outOf'],
        'xgb_class': gbclf.predict(X_valid),
        'xgb_reg': gbreg.predict(X_valid),
    })
    # fin_results2 picks classifier vs. regressor output per row and scales by outOf.
    model_perf['pred'] = model_perf.apply(fin_results2,axis = 1)

    actual = list(y_valid['nHelpful'])
    round_mae = mean_absolute_error(actual, list(model_perf.pred))
    round_mae_rd = mean_absolute_error(actual, list(np.round(model_perf.pred)))
    print("MAE w/o rounding: %0.3f" % round_mae)
    mae.append(round_mae)
    print("MAE w/ rounding: %0.3f" % round_mae_rd)
    mae_rd.append(round_mae_rd)
    iteration += 1

print('\nFinal training results:')
print('Without rounding, "mean" mae: %0.3f' % np.mean(mae))
print('With rounding, "mean" mae: %0.3f' % np.mean(mae_rd))

In [None]:
# Tree-Classifier
# Uses the classifier search and training columns left over from the last
# round of whichever loop ran most recently.
# NOTE: this rebinds the notebook-level name `features` to a column Index.
features = X_train_cl.columns
importances = gbclf.best_estimator_.feature_importances_
# Ascending order, so the most important feature ends up at the top of the chart.
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.ylabel('Feature');

In [None]:
# Tree-Regressor
# Plot the gradient-boosting regressor's feature importances. The values come
# from feature_importances_, not from model coefficients, so the title and the
# x-axis are labelled accordingly (matching the other importance charts).
# NOTE: this rebinds the notebook-level name `features` to a column Index.
features = X_train_reg.columns
importances = gbreg.best_estimator_.feature_importances_
# Ascending order, so the most important feature ends up at the top of the chart.
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.ylabel('Feature');

In [None]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Validation MAE per random split, without and with rounding.
mae = []
mae_rd = []
features = ['rating', 'year', 'month', 'len_review', 'category_0', 'category_1',
            'category_2', 'category_3', 'category_4', 'summary_sent', 'outOf']
iteration = 0
for i in range(5):
    print('\n{} of kfold {}'.format(iteration,5))
    # Fresh 80/20 split for this round; the round number is the seed.
    X_train_n, y_train_n, X_valid, y_valid = n_train_split(
        fin_train, features, ['helpful_rate', 'nHelpful'], i)

    # --- Logistic regression: trained only on rows with exactly one vote ----
    single_vote = X_train_n['outOf'] == 1
    X_train_cl = X_train_n[single_vote]
    y_train_cl = np.array(y_train_n['helpful_rate'][single_vote])
    kf_cl = StratifiedKFold(y_train_cl, n_folds=5, shuffle=True, random_state=0)
    parameters = {
        'penalty': ['l1','l2'],
        'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10],
    }
    logreg = LogisticRegression()
    logclf = GridSearchCV(
        logreg,
        parameters,
        cv=kf_cl,
        scoring='mean_absolute_error',
        n_jobs=-1,
    )
    logclf.fit(X_train_cl, y_train_cl)
    print("Best classification parameters:")
    print(logclf.best_params_)

    # --- Regressor: trained on rows with two or more votes ------------------
    multi_vote = (X_train_n['outOf'] != 0) & (X_train_n['outOf'] != 1)
    X_train_reg = X_train_n[multi_vote]
    y_train_reg = np.array(y_train_n['helpful_rate'][multi_vote])
    kf_reg = StratifiedKFold(y_train_reg, n_folds=5, shuffle=True, random_state=0)
    grid_search = dict(learning_rate=[0.01,0.05,0.1], loss=['ls', 'lad'])
    params = {'n_estimators': 100, 'max_depth': 4}
    gbreg = GridSearchCV(
        ensemble.GradientBoostingRegressor(**params),
        grid_search,
        cv=kf_reg,
        scoring='mean_absolute_error',
        n_jobs=-1,
    )
    gbreg.fit(X_train_reg, y_train_reg)
    print("Best regression parameters:")
    print(gbreg.best_params_)

    # --- Score on the held-out rows ------------------------------------------
    X_valid.reset_index(drop=True,inplace=True)
    model_perf = pd.DataFrame({
        'outOf': X_valid['outOf'],
        'xgb_class': logclf.predict(X_valid),
        'xgb_reg': gbreg.predict(X_valid),
    })
    # fin_results2 picks classifier vs. regressor output per row and scales by outOf.
    model_perf['pred'] = model_perf.apply(fin_results2,axis = 1)

    actual = list(y_valid['nHelpful'])
    round_mae = mean_absolute_error(actual, list(model_perf.pred))
    round_mae_rd = mean_absolute_error(actual, list(np.round(model_perf.pred)))
    print("MAE w/o rounding: %0.3f" % round_mae)
    mae.append(round_mae)
    print("MAE w/ rounding: %0.3f" % round_mae_rd)
    mae_rd.append(round_mae_rd)
    iteration += 1

print('\nFinal training results:')
print('Without rounding, "mean" mae: %0.3f' % np.mean(mae))
print('With rounding, "mean" mae: %0.3f' % np.mean(mae_rd))

## Model Selection

In [None]:
# Let us gather the parameters to properly export:
# NOTE: gbreg, logclf and gbclf are the grid-search objects left over from the
# LAST round of the loops above; gbreg was re-fitted in the logistic-regression
# loop, so it reflects that loop's final split.

# GB Regressor: across both CVs, we know best tuning parameter: {'learning_rate': 0.1, 'loss': 'lad'}
print('Full GB regress model parameters:\n\n',gbreg.best_estimator_,'\n')

# Logistic Regression, using best GB Regressor, tune parameters to: {'C': 0.1, 'penalty': 'l1'}
print('Full Logistic model parameters:\n\n',logclf.best_estimator_,'\n')

# GB Classifier, tune parameters to EITHER: {'learning_rate': 0.05, 'loss': 'exponential'} OR {'learning_rate': 0.05, 'loss': 'deviance'}
print('Full GB classifier model parameters:\n\n',gbclf.best_estimator_,'\n')

## First 2 submissions (log reg + xgb regression)

In [None]:
warnings.filterwarnings('ignore')
# Refit on the full training data using the parameters chosen during validation.
X_train_fin = fin_train[features]
y_train_fin = pd.DataFrame(fin_train['helpful_rate'])

# Stage 1 - logistic regression on the rows with exactly one vote.
single_vote = X_train_fin['outOf'] == 1
X_train_cl = X_train_fin[single_vote]
y_train_cl = np.array(y_train_fin['helpful_rate'][single_vote])
logclf = LogisticRegression(
    C=0.1, class_weight=None, dual=False, fit_intercept=True,
    intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
    penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
    verbose=0, warm_start=False,
)
logclf.fit(X_train_cl, y_train_cl)

# Stage 2 - gradient-boosting regression on the rows with two or more votes.
multi_vote = (X_train_fin['outOf'] != 0) & (X_train_fin['outOf'] != 1)
X_train_reg = X_train_fin[multi_vote]
y_train_reg = np.array(y_train_fin['helpful_rate'][multi_vote])
gbreg = ensemble.GradientBoostingRegressor(
    alpha=0.9, criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='lad', max_depth=4, max_features=None,
    max_leaf_nodes=None, min_impurity_decrease=0.0,
    min_impurity_split=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    n_estimators=100, presort='auto', random_state=None,
    subsample=1.0, verbose=0, warm_start=False,
)
gbreg.fit(X_train_reg, y_train_reg)

# Predict on the test set: both models score every row, then fin_results2
# picks the applicable output per row and scales it by outOf.
test_set = pd.DataFrame(fin_test,columns = features)
final_model1 = pd.DataFrame({
    'outOf': test_set['outOf'],
    'xgb_class': logclf.predict(test_set),
    'xgb_reg': gbreg.predict(test_set),
})
final_model1['pred'] = final_model1.apply(fin_results2,axis = 1)

In [None]:
# Label the classifier column after the model actually used (logistic
# regression) and give the combined output its export name.
final_model1.rename(columns={'xgb_class':'log_reg','pred': 'prediction'}, inplace=True)

In [None]:
# Two submission frames: raw predictions (und1) and rounded predictions (und2).
id_columns = ['reviewerID','itemID','outOf']
export_und1 = pd.concat([fin_test[id_columns], final_model1[['prediction']]], axis=1)
export_und2 = export_und1.copy()
export_und2['prediction'] = np.round(final_model1['prediction'])

In [None]:
def concat_export(row):
    """Build the submission key '<reviewerID>-<itemID>-<outOf>' for one row."""
    return '-'.join((row['reviewerID'], row['itemID'], str(row['outOf'])))

# Build the submission key column row by row for both export frames.
export_und1['userID-itemID-outOf'] = export_und1.apply(concat_export,axis = 1)
export_und2['userID-itemID-outOf'] = export_und2.apply(concat_export,axis = 1)

In [None]:
# Write key + prediction only (no index column): unrounded, then rounded.
export_und1[['userID-itemID-outOf','prediction']].to_csv('log_reg_nround.csv',index=False)
export_und2[['userID-itemID-outOf','prediction']].to_csv('log_reg_round.csv',index=False)

## Second 2 submissions (gb clf + gb reg)

In [None]:
## Let's try the gradient booster, potentially logistic regression has overfit:
warnings.filterwarnings('ignore')
# Refit on the full training data using the parameters chosen during validation.
X_train_fin = fin_train[features]
y_train_fin = pd.DataFrame(fin_train['helpful_rate'])

# Stage 1 - gradient-boosting classifier on the rows with exactly one vote
# (switch loss between 'exponential' and 'deviance' to try either setting).
single_vote = X_train_fin['outOf'] == 1
X_train_cl = X_train_fin[single_vote]
y_train_cl = np.array(y_train_fin['helpful_rate'][single_vote])
gbclf = ensemble.GradientBoostingClassifier(
    criterion='friedman_mse', init=None,
    learning_rate=0.05, loss='exponential', max_depth=4,
    max_features=None, max_leaf_nodes=None,
    min_impurity_split=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    n_estimators=100, presort='auto', random_state=None,
    subsample=1.0, verbose=0, warm_start=False,
)
gbclf.fit(X_train_cl, y_train_cl)

# Stage 2 - gradient-boosting regression on the rows with two or more votes.
multi_vote = (X_train_fin['outOf'] != 0) & (X_train_fin['outOf'] != 1)
X_train_reg = X_train_fin[multi_vote]
y_train_reg = np.array(y_train_fin['helpful_rate'][multi_vote])
gbreg = ensemble.GradientBoostingRegressor(
    alpha=0.9, criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='lad', max_depth=4, max_features=None,
    max_leaf_nodes=None, min_impurity_decrease=0.0,
    min_impurity_split=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    n_estimators=100, presort='auto', random_state=None,
    subsample=1.0, verbose=0, warm_start=False,
)
gbreg.fit(X_train_reg, y_train_reg)

# Predict on the test set: both models score every row, then fin_results2
# picks the applicable output per row and scales it by outOf.
test_set = pd.DataFrame(fin_test,columns = features)
final_model2 = pd.DataFrame({
    'outOf': test_set['outOf'],
    'xgb_class': gbclf.predict(test_set),
    'xgb_reg': gbreg.predict(test_set),
})
final_model2['pred'] = final_model2.apply(fin_results2,axis = 1)

In [None]:
# Give the combined output its export name, then build the unrounded (und3)
# and rounded (und4) submission frames.
final_model2.rename(columns={'pred': 'prediction'}, inplace=True)
id_columns = ['reviewerID','itemID','outOf']
export_und3 = pd.concat([fin_test[id_columns], final_model2[['prediction']]], axis=1)
export_und4 = export_und3.copy()
export_und4['prediction'] = np.round(final_model2['prediction'])

In [None]:
# Build the submission key column row by row for both export frames.
export_und3['userID-itemID-outOf'] = export_und3.apply(concat_export,axis = 1)
export_und4['userID-itemID-outOf'] = export_und4.apply(concat_export,axis = 1)

In [None]:
# Write key + prediction only (no index column): unrounded, then rounded.
export_und3[['userID-itemID-outOf','prediction']].to_csv('05exp_xgb_clf_nround.csv',index=False)
export_und4[['userID-itemID-outOf','prediction']].to_csv('05exp_xgb_clf_round.csv',index=False)