In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import csv
filename = 'Pypi_sample.csv'

In [2]:
# Load the dataset
feat_labels = ''
X = ''
y = ''
with open(filename, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    data = []
    for row in reader :
        data.append(row)
    feat_labels = data[0][:-1]
    X = [row[:-1] for row in data[1:]]
    y = [row[-1] for row in data[1:]]
print(feat_labels)

['Repository Fork?', 'Repository Forks Count', 'Repository Issues enabled?', 'Repository Wiki enabled?', 'Repository Pages enabled?', 'Repository Open Issues Count', 'Repository Readme filename', 'Repository Changelog filename', 'Repository Contributing guidelines filename', 'Repository License filename', 'Repository Code of Conduct filename']


In [3]:
# View the features
X[0:5]

[['0', '0.004419376', '1', '1', '1', '0.00905592', '1', '0', '0', '1', '0'],
 ['0', '0.000395508', '1', '1', '0', '0.000452796', '0', '0', '0', '0', '0'],
 ['0', '0.000894193', '1', '1', '0', '0.006112746', '1', '1', '0', '1', '0'],
 ['0', '6.88E-05', '1', '1', '0', '0.000226398', '1', '0', '0', '1', '0'],
 ['0', '1.72E-05', '1', '0', '0', '0.000226398', '1', '0', '0', '1', '0']]

In [4]:
# View the target data
y

['499',
 '54',
 '236',
 '15',
 '18',
 '18',
 '917',
 '15',
 '102',
 '25',
 '199',
 '15',
 '228',
 '176',
 '17',
 '32',
 '30',
 '525',
 '11',
 '30',
 '164',
 '10',
 '89',
 '23',
 '221',
 '17',
 '68',
 '20',
 '70',
 '1398',
 '13',
 '516',
 '15',
 '24',
 '30',
 '61',
 '779',
 '10',
 '50',
 '85',
 '33',
 '82',
 '256',
 '12',
 '21',
 '16',
 '12',
 '1620',
 '10',
 '90',
 '50',
 '92',
 '896',
 '12',
 '848',
 '15',
 '13',
 '26',
 '509',
 '11',
 '43',
 '22',
 '38',
 '17',
 '57',
 '72',
 '87',
 '19',
 '15',
 '54',
 '13',
 '44',
 '164',
 '520',
 '28',
 '73',
 '455',
 '30',
 '78',
 '20',
 '299',
 '23',
 '12',
 '19',
 '891',
 '25',
 '76',
 '182',
 '135',
 '15',
 '20',
 '14',
 '163',
 '32',
 '21',
 '16',
 '35',
 '40',
 '13',
 '22',
 '42',
 '81',
 '12',
 '37',
 '17',
 '60',
 '13',
 '6152',
 '34',
 '33',
 '32',
 '19',
 '19',
 '32',
 '37',
 '91',
 '18',
 '6152',
 '16',
 '67',
 '11',
 '56',
 '520',
 '30',
 '2316',
 '97',
 '57',
 '13',
 '13',
 '15',
 '115',
 '170',
 '317',
 '73',
 '63',
 '148',
 '15',
 '

In [5]:
# Split the data into 40% test and 60% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [6]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

('Repository Fork?', 0.010596142275311426)
('Repository Forks Count', 0.4360884938027275)
('Repository Issues enabled?', 0.008605100322595354)
('Repository Wiki enabled?', 0.03803016650074246)
('Repository Pages enabled?', 0.04022370866487121)
('Repository Open Issues Count', 0.3231847793153299)
('Repository Readme filename', 0.006586140444778428)
('Repository Changelog filename', 0.048060135947898826)
('Repository Contributing guidelines filename', 0.031050494889021202)
('Repository License filename', 0.0376667565402451)
('Repository Code of Conduct filename', 0.019908081296480213)


In [7]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
sfm = SelectFromModel(clf, threshold=0.15)

# Train the selector
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.15)

In [8]:
# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

Repository Forks Count
Repository Open Issues Count


In [9]:
# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)