# The N2 dataset

The N2 dataset provides features that are hypothesized to be informative about progression to psychosis.

In [13]:
import pandas

Set the path to the input file in the calf project's data directory (a relative path, resolved against the notebook's working directory)

In [14]:
# Location of the N2 CSV; a relative path, so it is resolved against the
# kernel's current working directory.
input_file_path = (
    "../../../data"  # data directory
    "/n2.csv"        # N2 dataset file
)

Read the input file into a DataFrame

In [15]:
# read_csv defaults already treat the first line as the header row and use a
# comma as the field separator, so only the path needs to be passed.
df = pandas.read_csv(input_file_path)
df.head(n=5)

Unnamed: 0,ctrl/case,ADIPOQ,SERPINA3,AMBP,A2M,ACE,AGT,APOA1,APOA2,APOA4,...,CALCA,IL6,LTA,CSF3,PGF,GCG_0001,IL1B,TGFB3,FGF2,MDA-LDL
0,0,1.1538,-1.008,0.465,-0.6181,-0.935,1.7169,0.974,1.7821,-0.258,...,-0.3688,0.8739,-0.239,2.8335,0.4469,0.101,0.1688,-0.1861,1.9591,-0.072
1,0,-0.7661,-1.039,1.2479,0.222,-0.714,2.6709,-0.275,0.168,0.9759,...,-0.3688,1.5408,3.5482,-0.6669,-0.777,1.015,0.1688,0.2689,-0.3498,-0.5491
2,0,-0.2721,-0.766,-0.748,-1.0371,0.0459,-0.294,0.046,0.932,-0.505,...,0.2562,0.206,-0.8099,-0.6669,-0.777,0.372,-0.5152,-0.1861,-0.3498,-0.5491
3,0,-0.8201,-1.281,0.465,-0.198,0.8059,0.898,-0.954,-0.427,-0.505,...,-0.3688,-0.5718,-0.8099,-0.6669,-0.505,-0.543,0.1688,0.2689,-0.5978,-0.5491
4,0,0.0019,-1.188,-0.709,0.6421,0.2039,-0.35,-0.275,0.507,0.2349,...,0.0612,1.8737,1.0388,-0.6669,0.4469,0.575,-0.2332,-0.4131,0.4472,-0.5491


Remove the outcome column to get the independent variables

In [16]:
# Everything except the outcome column is a feature. errors="ignore" keeps
# this a no-op (rather than a KeyError) if the outcome column is absent.
X = df.drop(columns="ctrl/case", errors="ignore")
X.head(n=5)

Unnamed: 0,ADIPOQ,SERPINA3,AMBP,A2M,ACE,AGT,APOA1,APOA2,APOA4,APOH,...,CALCA,IL6,LTA,CSF3,PGF,GCG_0001,IL1B,TGFB3,FGF2,MDA-LDL
0,1.1538,-1.008,0.465,-0.6181,-0.935,1.7169,0.974,1.7821,-0.258,2.6529,...,-0.3688,0.8739,-0.239,2.8335,0.4469,0.101,0.1688,-0.1861,1.9591,-0.072
1,-0.7661,-1.039,1.2479,0.222,-0.714,2.6709,-0.275,0.168,0.9759,0.461,...,-0.3688,1.5408,3.5482,-0.6669,-0.777,1.015,0.1688,0.2689,-0.3498,-0.5491
2,-0.2721,-0.766,-0.748,-1.0371,0.0459,-0.294,0.046,0.932,-0.505,-0.099,...,0.2562,0.206,-0.8099,-0.6669,-0.777,0.372,-0.5152,-0.1861,-0.3498,-0.5491
3,-0.8201,-1.281,0.465,-0.198,0.8059,0.898,-0.954,-0.427,-0.505,1.332,...,-0.3688,-0.5718,-0.8099,-0.6669,-0.505,-0.543,0.1688,0.2689,-0.5978,-0.5491
4,0.0019,-1.188,-0.709,0.6421,0.2039,-0.35,-0.275,0.507,0.2349,-0.612,...,0.0612,1.8737,1.0388,-0.6669,0.4469,0.575,-0.2332,-0.4131,0.4472,-0.5491


In [17]:
# DataFrame.shape is (number of rows, number of columns); unpack both at once.
rows, cols = X.shape

print("Number of Rows (data points): ", rows)
print("Number of Columns (features or variables): ", cols)

Number of Rows (data points):  72
Number of Columns (features or variables):  135


In [18]:
# Outcome column: all rows, single 'ctrl/case' column.
Y = df.loc[:, 'ctrl/case']

Y represents whether each individual became psychotic (1) or not (0). Y is a pandas Series.

In [19]:
# Preview the first five outcome labels.
Y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: ctrl/case, dtype: int64

In [20]:
# Summary statistics of the 0/1 outcome; because the labels are 0 and 1,
# the mean is the fraction of rows labelled 1.
Y.describe()

count    72.000000
mean      0.444444
std       0.500391
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: ctrl/case, dtype: float64

The individuals who did not progress to psychosis are labeled non_psychotic.

In [21]:
# Keep only the rows whose outcome label is 0.
non_psychotic = Y.loc[Y.eq(0)]
non_psychotic.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: ctrl/case, dtype: int64

The individuals who progressed to psychosis are labeled pre_psychotic.

In [22]:
# Keep only the rows whose outcome label is 1.
pre_psychotic = Y.loc[Y.eq(1)]

In [23]:
# Preview the first five rows of the label-1 subset.
pre_psychotic.head(n=5)

40    1
41    1
42    1
43    1
44    1
Name: ctrl/case, dtype: int64

In [24]:
# Human-readable outcome labels: 0 -> 'non_psychotic', 1 -> 'pre_psychotic'.
# The two lists are matched position by position.
Y_names = Y.replace(
    to_replace=[0, 1],
    value=['non_psychotic', 'pre_psychotic'],
)
Y_names

0     non_psychotic
1     non_psychotic
2     non_psychotic
3     non_psychotic
4     non_psychotic
          ...      
67    pre_psychotic
68    pre_psychotic
69    pre_psychotic
70    pre_psychotic
71    pre_psychotic
Name: ctrl/case, Length: 72, dtype: object