In [1]:
import pandas as pd

In [22]:
def process_df(df):
    """Remove unwanted targets and add success criteria"""
    rename_columns = {'shelxeAvgChainLength' : 'SHELXE_ACL',
                        'shelxeCC' : 'SHELXE_CC',
                        'buccFinalRfree' : 'SXRBUCC_final_Rfree',
                        'arpWarpFinalRfree' : 'SXRARP_final_Rfree',
                        'rfree' : 'REFMAC_Rfree',
                        'pdbCode' : 'native_pdb_code',
                        'resolution' : 'native_pdb_resolution',
                        'numResidues' : 'native_pdb_num_residues',
                        'ensembleName' : 'ensemble_name',
                        'ensembleNumResidues' : 'num_residues',
                        'ensembleNumModels' : 'subcluster_num_models',
                        'phaserLLG' : 'PHASER_LLG',
                        'phaserTFZ' : 'PHASER_TFZ',
                        'spaceGroup' : 'space_group',
                        'solventContent' : 'solvent_content',
                        'fastaLength' : 'chain_length',
                        'estChainsASU' : 'num_chains',
                       }
    df.rename(columns=rename_columns, inplace=True)
    df['SHELXE_OK'] = (df.SHELXE_CC >= 25) & (df.SHELXE_ACL >= 10)
    # or use df_unsucc = df.query("successful == 0")
    df[['SXRARP_final_Rfree', 'SXRBUCC_final_Rfree']] = df[['SXRARP_final_Rfree', 'SXRBUCC_final_Rfree']].fillna(1.0)
    df['MIN_RFREE'] = df[['SXRARP_final_Rfree', 'SXRBUCC_final_Rfree']].min(axis=1)
    df['REBUILD_OK'] = df['MIN_RFREE'] < 0.45
    df['REFMAC_OK'] = df['REFMAC_Rfree'] <= 0.45
    df['PHASER_OK'] = (df.PHASER_TFZ >= 8.0) | (df.PHASER_LLG > 120)
    df['success'] = (df.SHELXE_OK  & df.REBUILD_OK)
#     df = df[-df['native_pdb_code'].isin(['2BL2','2UUI','2X2V'])]
    return df


In [23]:
def summarise_df(df):
    df.sort_values(['native_pdb_code', 'success', 'SHELXE_CC'], ascending=[1,0,0], inplace=True)
    group = df.groupby(['native_pdb_code'])
    sdf = group.nth(0)[['native_pdb_resolution', 'native_pdb_num_residues', 'ensemble_name', 'space_group', 'chain_length', 'num_chains', 'solvent_content', 'subcluster_num_models','PHASER_LLG','PHASER_TFZ', 'SHELXE_CC', 'SHELXE_ACL','SXRBUCC_final_Rfree','SXRARP_final_Rfree']]
    sdf['success'] = group['success'].sum().astype(int)
    sdf['num_models'] = group['native_pdb_code'].size().astype(int)
    sdf['PHASER_OK'] = group['PHASER_OK'].sum().astype(int)
    sdf['SHELXE_OK'] = group['SHELXE_OK'].sum().astype(int)
    sdf['REBUILD_OK'] = group['REBUILD_OK'].sum().astype(int)
    # Add native_pdb_code column and reset index to ints
    sdf.reset_index('native_pdb_code', inplace=True)
    return sdf

In [25]:
fname = '/media/scratch/coiled-coil/final_results/final_results.csv'
fname = 'final_results.csv'
df = pd.read_csv(fname)
df = process_df(df)
df_summary = summarise_df(df)
df_summary.sort_values(['success'],ascending=False, inplace=True)

In [26]:
columns = ['native_pdb_code', 'native_pdb_resolution', 'space_group', 'solvent_content', 'chain_length', 'num_chains', 'native_pdb_num_residues']
df_summary.loc[df_summary['success'] == 0, columns].to_csv('todo.csv', index=False)


In [27]:
dfi = pd.read_excel('/Users/jmht/Downloads/cb5097sup2.xlsx')

In [43]:
df_fail = df_summary[df_summary['success'] == 0]
# dfi[dfi['STRUCTURE'].isin(df_summary['native_pdb_code'].values)]
dfif = dfi[dfi['STRUCTURE'].isin(df_fail['native_pdb_code'].values)]
dfif.sort_values(['STRUCTURE'], ascending=True, inplace=True)
dfif.to_csv('foo.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
