# Introduction to Python - Session 3
1. Installing and using packages
2. Regular expressions: the re package
3. Data analysis:
    - The numpy package
    - The pandas package

## EXERCISE 1 - Regular expressions

In [6]:
import re

**1. Extract numbers from `myString`.**

In [11]:
myString = 'Hello 12 hi 89. How are you 34?'
pattern = '[0-9]+'  # one or more consecutive ASCII digits

# Compile the pattern, then collect every non-overlapping match as a list of strings.
digit_runs = re.compile(pattern)
result = digit_runs.findall(myString)
print(result)

['12', '89', '34']


**2. Split a comma-delimited enumeration of `myFruits` into a list.**

In [12]:
myFruits = "banana, apple, melon, strawberry, peach, grapes"
pattern = ", "  # delimiter: a comma followed by one space

# Cut the string at every occurrence of the delimiter.
delimiter = re.compile(pattern)
result = delimiter.split(myFruits)
print(result)

['banana', 'apple', 'melon', 'strawberry', 'peach', 'grapes']


**3. Replace, for all items in the list, all `.` by `-`.**

In [16]:
myList = ["Spain.2006", "Belgium.2015", "Portugal.2001"]
# Remember: use \ to escape the meaning of . as a metacharacter.
# The r"" (raw string) prefix keeps the backslash literal, so the regex engine
# receives \. unchanged; without it, "\." is an invalid Python string escape
# and triggers a warning on recent Python versions.
pattern = r"\."
replacement = "-"

# Apply the substitution to every item of the list.
result = [re.sub(pattern, replacement, s) for s in myList]
print(result)

['Spain-2006', 'Belgium-2015', 'Portugal-2001']


**4. Detect whether "Python" is present at the beginning of `myString` and print the result.**

In [17]:
myString = "Python is fun"

# The ^ anchor only lets the pattern match at the very start of the string;
# re.search returns a match object on success and None otherwise.
match = re.search('^Python', myString)

message = "Pattern found inside the string" if match else "Pattern not found"
print(message)


Pattern found inside the string


**5. From the following list of file names, extract a list that contains only the sample id "sampleX".** Use the following expressions if needed: 

**`(?=...)`** = Matches if the pattern is followed by ...

**`(?<=...)`** = Matches if the pattern is preceded by ...

In [23]:
filenames = ["L2_sample1_GTAGCG.fastq.gz", "L1_sample2_ATTGCC.fastq.gz", 
             "L1_sample3_TGTTAC.fastq.gz", "L4_sample4_ATGGTA.fastq.gz"]

# Option 1 - Match the exact pattern of the string
# (?<=L[1-4]_)  : the sample id must be preceded by the lane prefix, e.g. "L2_"
# (?=_[AGCT]{6}\.fastq\.gz) : and followed by "_", a 6-letter barcode and the extension.
# The dots are escaped so that they match a literal "." instead of any character,
# and the raw string keeps the backslashes intact for the regex engine.
lookaround_pattern = r"(?<=L[1-4]_)sample[1-4](?=_[AGCT]{6}\.fastq\.gz)"

# re.findall returns a list per file name, hence the nested list in the output.
result = [re.findall(lookaround_pattern, s) for s in filenames]
print(result)


# Option 2 - Split string by "_" and get the second element
pattern = "_"

result = [re.split(pattern, s)[1] for s in filenames]
print(result)

[['sample1'], ['sample2'], ['sample3'], ['sample4']]
['sample1', 'sample2', 'sample3', 'sample4']


## EXERCISE 2 - Introduction to NumPy

In [1]:
import numpy as np

**1. Create an array `a` of random numbers and shape (3,4).**

In [18]:
# Draw a 3x4 array of floats sampled from the half-open interval [0, 1).
# No seed is set, so the values differ on every run.
a = np.random.random(size=(3, 4))
print(a)

[[0.71182358 0.27138085 0.43316372 0.74481914]
 [0.96771308 0.00871517 0.676256   0.26688211]
 [0.49560982 0.51423813 0.62729738 0.72659303]]


**2. Add a fifth column to `a` with values 0, 0.5, and 1.**

In [19]:
# The new column must be 2-D with one row per row of `a` so that it can be
# joined along axis 1 (the column axis).
# Note: this cell rebinds `a`; running it twice adds a second extra column.
new_column = [[0], [0.5], [1]]
a = np.concatenate((a, new_column), axis=1)
print(a)

[[0.71182358 0.27138085 0.43316372 0.74481914 0.        ]
 [0.96771308 0.00871517 0.676256   0.26688211 0.5       ]
 [0.49560982 0.51423813 0.62729738 0.72659303 1.        ]]


**3. Find all values that are greater than or equal to 0.5.**

In [20]:
# Boolean mask with the same shape as `a`; indexing with it returns the
# selected values as a flat 1-D array.
at_least_half = a >= 0.5
print(a[at_least_half])

[0.71182358 0.74481914 0.96771308 0.676256   0.5        0.51423813
 0.62729738 0.72659303 1.        ]


**4. Replace the entire first row with NaN values.**

In [26]:
# Indexing with a single integer selects the whole first row; the scalar NaN
# is assigned to every element of that row (in place).
a[0] = np.nan
print(a)

[[       nan        nan        nan        nan        nan]
 [0.96771308 0.00871517 0.676256   0.26688211 0.5       ]
 [0.49560982 0.51423813 0.62729738 0.72659303 1.        ]]


**5. Use matrix multiplication against the vector `b = np.array([1, 0, 10, 2, 0])`.** Note that `b` needs one entry per column of `a`, i.e. five entries.

In [27]:
# One weight per column of `a`.
b = np.array([1, 0, 10, 2, 0])

# Matrix-vector product: one value per row of `a`.
# Any row containing NaN yields NaN.
print(a @ b)

[       nan 8.26403727 8.22176968]


**6. Element-wise multiplication of the array `a` and the vector `b`.** Note that `b` is broadcast along all rows.

In [28]:
# Element-wise product: `b` is broadcast across the rows of `a`,
# so column j of `a` is scaled by b[j].
print(np.multiply(a, b))

[[       nan        nan        nan        nan        nan]
 [0.96771308 0.         6.76255997 0.53376421 0.        ]
 [0.49560982 0.         6.2729738  1.45318606 0.        ]]


**7. Calculate the sum, the mean, and the median of each row of `a`. Use the corresponding NumPy functions.**

In [29]:
# axis=1 reduces across the columns, giving one value per row.
# NaN propagates through all three reductions, hence the NaN for the first row.
for reduce_rows in (np.sum, np.mean, np.median):
    print(reduce_rows(a, axis=1))

[       nan 2.41956635 3.36373836]
[       nan 0.48391327 0.67274767]
[       nan 0.5        0.62729738]


## EXERCISE 3 - Introduction to Pandas

In [30]:
import pandas as pd

**1. Create the following DataFrame `mydf`, with index `John, Jessica, Steve, Rachel` and columns `Age, Height, Sex`.**

```
43 	181 	M
34 	172 	F
22 	189 	M
27 	167 	F
```

In [44]:
# Build the table column by column; the dict keys become the column names
# and `index` supplies the row labels.
mydf = pd.DataFrame(
    {
        "Age": [43, 34, 22, 27],
        "Height": [181, 172, 189, 167],
        "Sex": ["M", "F", "M", "F"],
    },
    index=["John", "Jessica", "Steve", "Rachel"],
)
mydf

Unnamed: 0,Age,Height,Sex
John,43,181,M
Jessica,34,172,F
Steve,22,189,M
Rachel,27,167,F


**2. What is the shape of `mydf`?**

In [45]:
# .shape is a tuple: (number of rows, number of columns).
mydf.shape

(4, 3)

**3. Calculate the average age and height in `mydf`.**

In [46]:
# Average only the numeric columns. "Sex" holds strings, and recent pandas
# versions (>= 2.0) raise a TypeError instead of silently skipping
# non-numeric columns when computing the mean of the whole DataFrame.
mydf[["Age", "Height"]].mean(axis=0)

Age        31.50
Height    177.25
dtype: float64

**4. Add one row to `mydf`: Georges who is 53 years old and 168cm tall.**

In [47]:
# Values in column order: Age, Height, Sex.
georges = [53, 168, "M"]

# Assigning to a label that is not yet in the index appends a new row (in place).
mydf.loc["Georges"] = georges
mydf

Unnamed: 0,Age,Height,Sex
John,43,181,M
Jessica,34,172,F
Steve,22,189,M
Rachel,27,167,F
Georges,53,168,M


**5. Change the row names of `mydf` so that the data becomes anonymous.** Use Patient1, Patient2, etc. instead of actual names.

In [48]:
# Generate Patient1, Patient2, ... from the actual number of rows, so the
# relabelling works whatever the size of the table (a hard-coded list of
# labels fails with a length mismatch if the row count differs).
mydf.index = [f"Patient{i}" for i in range(1, len(mydf) + 1)]
mydf

Unnamed: 0,Age,Height,Sex
Patient1,43,181,M
Patient2,34,172,F
Patient3,22,189,M
Patient4,27,167,F
Patient5,53,168,M


**6. Create the DataFrame `mydf2` that is a subset of `mydf` containing only the female entries.**

In [49]:
# Boolean mask: True for the rows whose Sex is "F".
is_female = mydf["Sex"] == "F"

# Keep only the rows selected by the mask (all columns).
mydf2 = mydf[is_female]
mydf2

Unnamed: 0,Age,Height,Sex
Patient2,34,172,F
Patient4,27,167,F


**7. Import the data in `more_patients.tsv` in a DataFrame named `moredf`.**

In [53]:
# Tab-separated file; its first column is used as the row labels.
more_patients_path = "data/more_patients.tsv"
moredf = pd.read_csv(more_patients_path, sep="\t", index_col=0)
moredf

Unnamed: 0,Age,Height,Sex
Patient6,18,175,M
Patient7,60,170,F
Patient8,36,169,F


**8. Create a DataFrame `mydf3` by concatenating `mydf` and `moredf`.**

In [51]:
# Stack the two tables on top of each other (row-wise, the default axis=0);
# both share the same columns.
tables = (mydf, moredf)
mydf3 = pd.concat(tables)
mydf3

Unnamed: 0,Age,Height,Sex
Patient1,43,181,M
Patient2,34,172,F
Patient3,22,189,M
Patient4,27,167,F
Patient5,53,168,M
Patient6,18,175,M
Patient7,60,170,F
Patient8,36,169,F
