In [None]:
import os
import glob
from urllib.request import urlretrieve
import zipfile

import pandas as pd
import numpy as np

In [None]:
#### sort, glob, rank, iloc, loc, insert values when condition, isin
#### create a dataframe

### Creating dataframes from scratch

In [None]:
'''
create arrays

gc : list
    A list of colors
g : array
    An array of colors picked from gc, used for the color of points
x : array
    An array with values randomly picked from a standard normal distribution
y : array
    x plus random noise (standard normal noise scaled by 0.5)
z : array
    random ints from 5 to 99 (the upper bound 100 is excluded), used for the
    size of points
'''

# Seed a generator so the notebook produces the same data on every run
SEED = 42
rng = np.random.default_rng(SEED)

gc = ['red', 'blue', 'pink', 'yellow', 'brown', 'magenta', 'green', 'orange']
g = rng.choice(gc, 500)  # pick an element from gc 500 times
x = rng.standard_normal(500)  # pick values from a standard normal distribution
y = x + .5*rng.standard_normal(500)  # x plus noise
z = rng.integers(5, 100, 500)  # random ints in [5, 100) for size of points



In [None]:
'''
Build a pandas DataFrame from a dictionary: each key is a column name (a string)
and each value is the array or list holding that column's data
'''

column_data = dict(g=g, x=x, y=y, z=z)
mydf = pd.DataFrame(column_data)
mydf.head()

In [None]:
mydf.value_counts('g')

### DataFrame plots
Pandas has some plotting functionality built in:

To plot a histogram of a column, select the column and call `plot.hist`:
```mydf['x'].plot.hist()```

In [None]:
mydf['x'].plot.hist(bins=50)

In [None]:
'''
Scatter plot of mydf: 'x' against 'y', with the point size taken from
column 'z' and the point color taken from column 'g'
'''

mydf.plot.scatter(x='x', y='y', s='z', c='g')

In [None]:
## insert a new column into the dataframe

mydf['dxy'] = mydf['x'] - mydf['y']
mydf['name'] = "Chris"  # This will create a new column with all rows equal to 'Chris'
mydf

### Pandas DataFrames

A pandas dataframe is similar to a spreadsheet with data in columns and rows. It is common to have data in a csv file or similar that can be
imported into python. Pandas can also read many other [types of files](https://pandas.pydata.org/pandas-docs/stable/reference/io.html) and convert them into dataframes (excel, html, xml, json, sql, ...)

- We have csv files from a screen, one csv file per plate [Plate002](https://www.dropbox.com/home/Work/PythonDataScience/data?preview=Plate002.csv)
- We also have a csv file that describes each plate's contents
- read data csv files into DataFrames
- combine individual DataFrames into one DataFrame
- clean the DataFrame
- create plate/well/object columns from other columns
- read the plate map csv
- clean the plate map
- combine the plate map and the data csv
- do some calculations



In [None]:
### Download files from dropbox

url = "https://www.dropbox.com/scl/fi/moabkzm7npb72i6kl2zn3/data.zip?rlkey=h2cl8b55yo1goj8bwfh0lof8n&dl=1"

# save the archive in the current working directory as data.zip
urlretrieve(url, "data.zip")

try:
    # the handle is named `archive` so the builtin zip() stays usable in later cells
    with zipfile.ZipFile("data.zip", 'r') as archive:
        archive.extractall()  # unpack into the current working directory
finally:
    # the archive is no longer needed; remove it even if extraction fails
    os.remove("data.zip")


In [None]:
### use os.listdir to see the files in the data directory

os.listdir('data')

In [None]:
### use glob to create a list of all csv files that start with "Plate"
### glob returns matches in arbitrary order, so sort them to get a repeatable file order

files = sorted(glob.glob('data/Plate*.csv'))
files

In [None]:
'''
Read every csv data file into one DataFrame

- read each file name in `files` with pd.read_csv
- tag its rows with the name of the file they came from (column 'csvfile')
- collect the per-file dataframes in a list
- use pd.concat to concatenate the list of dataframes into one large dataframe
'''

dflist = [pd.read_csv(path).assign(csvfile=path) for path in files]

df = pd.concat(dflist)


In [None]:
df.shape

In [None]:
''' different ways to look at the DataFrame

df.head(n) : print the first n lines of the dataframe - 5 by default 
df.tail(n) : print the last n line of the dataframe - 5 by default
df.sample(n) : print n random lines from the dataframe - 1 by default. 
df.columns : return the column names
'''

df.head(3)

In [None]:
### Use df.tail to see the last rows of the dataframe
df.tail(6)

In [None]:
'''
Use df.sample to see random rows from the dataframe 
Use an integer N as an argument to get N rows
Use a frac=number - a decimal less than one to get that fraction
'''
df.sample(10)

In [None]:
df.columns[0]

### unwanted columns

The `Unnamed...` columns are not needed, so I want to get rid of them. The DataFrame method `drop` can do that.


In [None]:
# Drop by name rather than by position: dropping df.columns[0] would remove a
# real data column if this cell were run a second time.
# NOTE(review): assumes the unwanted columns all start with "Unnamed" - confirm with df.columns
unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)
df.head()

In [None]:
## show df and look at the left column of numbers
## it would be nice to have that be number from 0 to N-1
## also look at df.index
## the index is the name of the rows

df

In [None]:
'''
Use reset_index to set the index column to a unique integer and
change the name of the other column to plate_index
'''

df = df.reset_index(names='plate_index')
df

In [None]:
# one .loc call with (row label, column name) instead of chaining two lookups
df.loc[86, 'File']

In [None]:
### use this to explore the df['File'].str options like upper, contains, split
df['File'].str.split('/')[0]

In [None]:
## split each File path on "/" and keep the piece at position [1], which is the batch

path_parts = df['File'].str.split("/")
df['batch'] = path_parts.str.get(1)
df[['File', 'batch']].sample(4)

In [None]:
### Use dataframe str methods to make a plate column
# piece [2] of the "/"-split path is the file name; its first "_" token holds the plate
file_name = df['File'].str.split("/").str[2]
plate_token = file_name.str.split("_").str[0]
df['plate'] = plate_token.str.replace("Plate", "").astype(int)
df[['File', 'plate']].sample(4)

In [None]:
### Exercise : do the same thing for the well
# the second "_" token of the file name holds the well; strip the "Well" label and convert to int

file_name = df['File'].str.split("/").str[2]
well_token = file_name.str.split("_").str[1]
df['well'] = well_token.str.replace("Well", "").astype(int)

In [None]:
df[['File', 'batch', 'plate', 'well']].sample(5)

### Mapping functions
I don't like stringing together all of the str.something.str....., I find using mapping functions "better"

A series from the dataframe has a map method. In this case there are 2 batches, so we could create a map for those from a dictionary:

```
batch_map = {
    "20240321_140300_322":"03/21/2024",
    "20240402_081806_268":"04/02/2024"
}

df['batch_date'] = df['batch'].map(batch_map)
```
A function can also be used to operate on every entry in a column

In [None]:
# Lookup table: batch name -> date string (MM/DD/YYYY); each date matches the
# leading YYYYMMDD digits of its batch name
batch_map = {
    "20240321_140300_322":"03/21/2024",
    "20240402_081806_268":"04/02/2024"
}

# Series.map looks every batch value up in the dictionary;
# a batch that is not a key of batch_map becomes NaN
df['batch_date'] = df['batch'].map(batch_map)
df.sample(10)

In [None]:
## write a mapping function to create the object column 
## './20240321_140300_322/Plate002_Well1_Object0.tif_projection.tif'
##                                       ******^

def map_object(x):
    """Return the object number encoded in an image file path.

    The file name is expected to look like
    'Plate002_Well1_Object0.tif_projection.tif': the third "_"-separated
    token starts with the 6-character label "Object", followed by the
    number and then a ".".

    Parameters
    ----------
    x : str
        Path of the image file.

    Returns
    -------
    int
        The digits between "Object" and the first "." of that token.
    """
    object_token = os.path.basename(x).split("_")[2]
    # skip the first 6 characters (the "Object" label)
    after_label = object_token[6:]
    # keep everything before the first "."; index() raises ValueError if there is none
    return int(after_label[:after_label.index(".")])

# run map_object on every File path to build the integer 'object' column
df['object'] = df['File'].map(map_object)

In [None]:
df[['File', 'batch', 'plate', 'well', 'object']].sample(6)

### Read plate map

Use pd.read_csv to read in the platemap

In [None]:
plate_map = pd.read_csv("data/20240321_Map.csv")

In [None]:
plate_map.shape

In [None]:
plate_map.sample(8)

In [None]:
'''
The plate_map columns Batch, Plate and Well are capitalized, while I like
using lower case almost all the time, so they differ slightly from batch,
plate, well.

Use the dataframe rename method to make those plate_map column names lower case
'''
lowercase_names = {"Batch":"batch", "Plate":"plate", "Well":"well"}
plate_map = plate_map.rename(columns=lowercase_names)

In [None]:
plate_map.columns, df.columns

### Merging

The dataframes df and plate_map can now be joined/merged together so
every row will have the measurements along with the slide and sample information

In [None]:
## Use the merge function to join df with plate_map on the shared key columns.
## how='inner' keeps only rows whose batch/plate/well combination is in both dataframes.

key_columns = ['batch', 'plate', 'well']
merged = pd.merge(df, plate_map, how='inner', on=key_columns)

In [None]:
df.shape, merged.shape

In [None]:
# use how='left' and run this to see which rows in df don't have matches in plate_map
#merged[~pd.notna(merged['Sample'])]

In [None]:
merged.sample(2)

In [None]:
'''The sample column has information about feeding - food and time,
put these into separate columns'''

merged['Sample'].unique()

In [None]:
# split each Sample on spaces once: the first word is the food, the last word is the time label
sample_words = merged['Sample'].str.split(" ")
merged['food'] = sample_words.str[0]
merged['time_desc'] = sample_words.str[-1]

In [None]:
merged[['Sample', 'food', 'time_desc']]

In [None]:
merged['time_desc'].unique()

In [None]:
### write a mapping dictionary to convert the time_desc column into hours
### Caution: don't name variables time. There is a common built in library called time

# every value is a duration in hours
feed_map = {
    'unfed': 0,
    '10min': 10./60.,  # 10 minutes as a fraction of an hour
    '2h': 2,
    '4h': 4,
    '6h': 6,           # already in hours, no *24 conversion
    '1d': 24,
    '2d': 2*24,
    '3d': 3*24,
    '4d': 4*24,
    '5d': 5*24,
    '6d': 6*24,
    '7d': 7*24
}

# look up each time_desc label in feed_map to get the feeding time in hours
merged['time'] = merged['time_desc'].map(feed_map)

In [None]:

### mapping function if you are interested

def feedtime(stime):
    """Convert a feeding-time label such as '10min', '4h' or '2d' to hours.

    Parameters
    ----------
    stime : str
        'unfed', or a number followed by one of the unit suffixes
        'd' (days), 'h' (hours) or 'min' (minutes).

    Returns
    -------
    float
        The time in hours; 0 for 'unfed' and -1 for a label that does not
        end in one of the known unit suffixes.

    Raises
    ------
    ValueError
        If the text before a known suffix is not a number.
    """
    if stime == 'unfed':
        return 0

    # hours per unit, keyed by the suffix that marks the unit
    units = (('d', 24.), ('h', 1.), ('min', 1./60.))
    for suffix, hours_per_unit in units:
        if stime.endswith(suffix):
            return float(stime[:-len(suffix)]) * hours_per_unit

    # unrecognized label
    return -1

# Alternative to a dictionary lookup; uncomment to compute the 'time' column with it.
# merged['time'] = merged['time_desc'].map(feedtime)

In [None]:
merged.sample(10) #[['plate', 'well', 'food', 'time_desc', 'time']]

### Groupby and aggregation

We want to use the area and density measurements to compare the different foods and feeding times. Multiple measurements (objects) of each case were taken, so we want to use the mean or other statistics to describe each case. Groupby lets us break the dataframe into parts that go together and do calculations.

The groupby process needs the following:
- columns to group together
- what columns to do calculations on
- the functions to calculate

It might make sense to group things by ['plate', 'well']

In [None]:
## do a groupby on plate and well, use the object column to count
## then change the column to 'Density1' and the agg function to 'mean' or something else
# one output row per (plate, well) pair holding the mean Density1 of its rows
merged.groupby(['plate', 'well'])['Density1'].agg('mean').reset_index()

In [None]:
## Groupby with ['food', 'time'] then use the columns Area, Density1, and Density2 to agg with mean

# one row per (food, time) pair holding the mean of each measurement column
gmean = merged.groupby(["food", "time"])[['Area', 'Density1', 'Density2']].agg('mean').reset_index()

In [None]:
gmean

In [None]:
gmean.sort_values(['food', 'time'])

In [None]:
# rank the rows within each food by Density1; ascending=False gives rank 1 to the largest value
gmean['rank'] = gmean.groupby(['food'])['Density1'].rank(ascending=False)

In [None]:
gmean.sort_values(['rank'])

In [None]:
merged

In [None]:
### Do this if there is time
### use transform to use all values of a group for a calculations


In [None]:
def _fraction_of_group_max(values):
    """Divide every value of one group by that group's largest value."""
    return values / values.max()

# transform returns one result per original row, so it can be stored as a new column
density_by_case = merged.groupby(['food', 'time'])['Density1']
merged['norm_max'] = density_by_case.transform(_fraction_of_group_max)

In [None]:
merged